Skip to content
This repository has been archived by the owner on Mar 30, 2023. It is now read-only.

Commit

Permalink
Made dac-ansible pull v2.5 (#136)
Browse files Browse the repository at this point in the history
* Made dac-ansible pull v2.5

* Be clear about using python3 in dac-ansible

* Fix create servers for python3

* Add comment about reaching a jump host

This is useful when your network is isolated and you access it
via a floating IP on one of the nodes.

* Add python3 virtualenv

* Fix certificate generation

* Ensure we gather all facts

* Try to avoid ssh timeout issues

* Move to latest lustre release 2.12.4

* Fix up ansible install instructions

* Fix slurm test script path

* Add ansible config for fsansible

* Make loops more consistent

Hopefully also fixing python3 compatibility for all the loops.

* Fix up name of delete playbook task

* Update lustre to pick up centos 7.8
  • Loading branch information
JohnGarbutt authored Jun 9, 2020
1 parent 6280e14 commit ae8f775
Show file tree
Hide file tree
Showing 10 changed files with 63 additions and 30 deletions.
6 changes: 4 additions & 2 deletions dac-ansible/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@ preferred.

Install Ansible and the OpenStack SDK, e.g. in a Python virtual environment:

sudo yum install python3-pip
sudo pip3 install virtualenv
virtualenv .venv
. .venv/bin/activate
pip install -U pip
pip install -U ansible openstacksdk
pip install -U ansible openstacksdk openstackclient

Pull in Ansible role dependencies:

Expand All @@ -34,7 +36,7 @@ Once the Ansible has finished, you can login and try a Slurm test:
ssh centos@<ip-of-slurm-master>
sudo -i
scontrol show burstbuffer
/usr/local/bin/data-acc/tools/slurm-test.sh
/usr/local/bin/data-acc-v<version>/tools/slurm-test.sh
squeue
scontrol show burstbuffer

Expand Down
7 changes: 7 additions & 0 deletions dac-ansible/ansible.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Ansible configuration for the dac-ansible playbooks.
[defaults]
# Maximum number of parallel worker processes Ansible uses across hosts.
forks = 20
# "smart" fact gathering: gather facts by default, but skip hosts whose
# facts have already been gathered earlier in the run.
gathering = smart

[ssh_connection]
# Run modules over the already-open SSH session instead of copying them to
# the remote host first, which reduces the number of SSH operations.
# NOTE(review): with become, pipelining needs "requiretty" disabled in
# sudoers on the targets - confirm for the images in use.
pipelining = True
# Number of times to retry a failed SSH connection attempt.
retries = 3
24 changes: 15 additions & 9 deletions dac-ansible/create-servers.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,12 @@ def main():
servers['slurm-cpu2'] = create_server(
conn, 'slurm-cpu2', image, flavor, network)

inventory_template = """[dac_workers]
inventory_template = """[all:vars]
ansible_user=centos
# update if you have a jump host with a floating ip
#ansible_ssh_common_args='-C -o ControlMaster=auto -o ControlPersist=240s -o ProxyCommand="%s"'
[dac_workers]
dac1.dac.hpc.cam.ac.uk ansible_host=%s ansible_user=centos
dac2.dac.hpc.cam.ac.uk ansible_host=%s ansible_user=centos
dac3.dac.hpc.cam.ac.uk ansible_host=%s ansible_user=centos
Expand Down Expand Up @@ -89,14 +94,15 @@ def main():
slurm_master
slurm_workers"""

print inventory_template % (
servers['dac1'],
servers['dac2'],
servers['dac3'],
servers['dac-etcd'],
servers['dac-slurm-master'],
servers['slurm-cpu1'],
servers['slurm-cpu2'])
print(inventory_template % (
"ssh centos@jumphostip -W %h:%p",
servers['dac1'],
servers['dac2'],
servers['dac3'],
servers['dac-etcd'],
servers['dac-slurm-master'],
servers['slurm-cpu1'],
servers['slurm-cpu2']))


if __name__ == '__main__':
Expand Down
17 changes: 13 additions & 4 deletions dac-ansible/master.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,36 +56,45 @@
pki_dir: /home/centos/pki-dir
pki_self_sign: True
pki_ca:
expiry: '87600h'
pathlen: 0
cname: ca.dac.hpc.cam.ac.uk
pki_servers:
- cname: dac-etcd.dac.hpc.cam.ac.uk
include_localhost: True
profile: client-server
sans:
- dac-etcd.dac.hpc.cam.ac.uk
altips:
- "{{ hostvars[groups['etcd_master'][0]].ansible_host }}"
- cname: dac1.dac.hpc.cam.ac.uk
include_localhost: True
profile: client-server
sans:
- dac1.dac.hpc.cam.ac.uk
- cname: dac2.dac.hpc.cam.ac.uk
include_localhost: True
profile: client-server
sans:
- dac2.dac.hpc.cam.ac.uk
# Certificate entry for dac3; keys mirror the other pki_servers entries
# (cname, include_localhost, profile, sans).
- cname: dac3.dac.hpc.cam.ac.uk
  include_localhost: True
  profile: client-server
  sans:
    - dac3.dac.hpc.cam.ac.uk
- cname: dac-slurm-master.dac.hpc.cam.ac.uk
include_localhost: True
profile: client-server
sans:
- dac-slurm-master.dac.hpc.cam.ac.uk
- cname: slurm-cpu1.dac.hpc.cam.ac.uk
include_localhost: True
profile: client-server
sans:
- slurm-cpu1.dac.hpc.cam.ac.uk
- cname: slurm-cpu2.dac.hpc.cam.ac.uk
include_localhost: True
profile: client-server
sans:
- slurm-cpu2.dac.hpc.cam.ac.uk

Expand Down Expand Up @@ -141,7 +150,7 @@
- hosts: dac_workers:slurm_workers
become: True
vars:
lustre_release: "2.12.2"
lustre_release: "2.12.5"
tasks:
- name: enable lustre server repo
yum_repository:
Expand All @@ -168,7 +177,7 @@
- hosts: dac_workers:slurm_workers
become: True
vars:
lustre_release: "2.12.2"
lustre_release: "2.12.5"
tasks:
- name: Install Lustre Server
yum:
Expand All @@ -182,7 +191,7 @@
- hosts: dac_workers:slurm_workers
become: True
vars:
lustre_release: "2.12.2"
lustre_release: "2.12.5"
tasks:
- name: Install Lustre Client
yum:
Expand Down
4 changes: 2 additions & 2 deletions dac-ansible/roles/data-acc/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
data_acc_version: 'v2.0'
data_acc_checksum: 'sha256:7c4d4535b0402aa40b8b0558c10fc12ae25fb70364df6d9ecfd9c699d491b735'
data_acc_version: 'v2.5'
data_acc_checksum: 'sha256:0f32004fe9b0b1b1fdea48c21491be20f4cda0135dcfc4eec25ac381433a8322'
data_acc_platform: linux-amd64
data_acc_mirror: https://github.com/RSE-Cambridge/data-acc/releases/download
data_acc_install_dir: /usr/local/bin
Expand Down
5 changes: 3 additions & 2 deletions dac-ansible/roles/data-acc/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,17 @@
- dacd
- dacctl

- name: Install python-virtualenv
- name: Install python3-virtualenv
package:
name: python-virtualenv
name: python3-virtualenv
state: present
when: "'slurm' not in group_names"

- name: Add ansible venv
pip:
name: ansible
virtualenv: "{{data_acc_install_dir}}/{{data_acc_name}}/fs-ansible/.venv"
virtualenv_command: "virtualenv-3"
when: "'slurm' not in group_names"

- include_tasks: systemd.yml
Expand Down
8 changes: 8 additions & 0 deletions fs-ansible/ansible.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Ansible configuration for the fs-ansible playbooks.
[defaults]
# Maximum number of parallel worker processes Ansible uses across hosts.
forks = 30
# "smart" fact gathering: gather facts by default, but skip hosts whose
# facts have already been gathered earlier in the run.
gathering = smart

[ssh_connection]
# Share one SSH connection per host via an OpenSSH control master, kept
# open in the background for 240s after the last use.
ssh_args = -o ControlMaster=auto -o ControlPersist=240s
# Run modules over the already-open SSH session instead of copying them to
# the remote host first, which reduces the number of SSH operations.
# NOTE(review): with become, pipelining needs "requiretty" disabled in
# sudoers on the targets - confirm for the images in use.
pipelining = True
# Number of times to retry a failed SSH connection attempt.
retries = 3
2 changes: 1 addition & 1 deletion fs-ansible/delete.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
- name: Create Lustre filesystem (format)
- name: Delete Lustre filesystem
hosts: all
any_errors_fatal: true
become: yes
Expand Down
12 changes: 6 additions & 6 deletions fs-ansible/roles/lustre/tasks/format.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
part_end: "{{ mdt_size_mb }}MB"
label: gpt
state: present
loop: "{{ (list(mdts.keys()) + list(osts.keys())) | unique }}"
loop: "{{ osts | combine(mdts) | list }}"
- name: Add OST Partition
parted:
device: "/dev/{{ item }}"
Expand All @@ -26,7 +26,7 @@
part_end: "100%"
label: gpt
state: present
loop: "{{ (list(mdts.keys()) + list(osts.keys())) | unique }}"
loop: "{{ osts | combine(mdts) | list }}"

- name: Format MDTs
block:
Expand All @@ -38,14 +38,14 @@
part_end: "{{ mdt_size_mb }}MB"
label: gpt
state: present
loop: "{{ mdts.keys() }}"
loop: "{{ mdts.keys() | list }}"

- name: Wait for MDT partition to appear in /dev
wait_for:
path: "/dev/{{ item }}p1"
timeout: 30
sleep: 5
loop: "{{ mdts.keys() }}"
loop: "{{ mdts.keys() | list }}"

- name: Reformat MDTs
command: "/usr/sbin/mkfs.lustre --mdt --reformat --fsname={{ fs_name }} --index={{ item.value }} --mgsnode={{ mgsnode }}{{ lnet_suffix }} /dev/{{ item.key }}p1"
Expand All @@ -61,14 +61,14 @@
part_end: "100%"
label: gpt
state: present
loop: "{{ osts.keys() }}"
loop: "{{ osts.keys() | list }}"

- name: Wait for OST partition to appear in /dev
wait_for:
path: "/dev/{{ item }}p2"
timeout: 30
sleep: 5
loop: "{{ osts.keys() }}"
loop: "{{ osts.keys() | list }}"

- name: Reformat OSTs
command: "/usr/sbin/mkfs.lustre --ost --reformat --fsname={{ fs_name }} --index={{ item.value }} --mgsnode={{ mgsnode }}{{ lnet_suffix }} /dev/{{ item.key }}p2"
Expand Down
8 changes: 4 additions & 4 deletions fs-ansible/roles/lustre/tasks/wipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,27 @@
device: "/dev/{{ item }}"
number: 1
state: absent
loop: "{{ (list(mdts.keys()) + list(osts.keys())) | unique }}"
loop: "{{ osts | combine(mdts) | list }}"

- name: Remove old OST Partition
parted:
device: "/dev/{{ item }}"
number: 2
state: absent
loop: "{{ (list(mdts.keys()) + list(osts.keys())) | unique }}"
loop: "{{ osts | combine(mdts) | list }}"

- name: Wait for MDT partition to disappear from /dev
wait_for:
path: "/dev/{{ item }}p1"
state: absent
timeout: 120
sleep: 5
loop: "{{ (list(mdts.keys()) + list(osts.keys())) | unique }}"
loop: "{{ osts | combine(mdts) | list }}"

- name: Wait for OST partition to disappear from /dev
wait_for:
path: "/dev/{{ item }}p2"
state: absent
timeout: 120
sleep: 5
loop: "{{ (list(mdts.keys()) + list(osts.keys())) | unique }}"
loop: "{{ osts | combine(mdts) | list }}"

0 comments on commit ae8f775

Please sign in to comment.