From ae8f77565fd8b2b6d7d43e8c85187f6cc5920c3d Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Tue, 9 Jun 2020 16:10:34 +0100 Subject: [PATCH] Made dac-ansible pull v2.5 (#136) * Made dac-ansible pull v2.5 * Be clear about using python3 in dac-ansible * Fix create servers for python3 * Add comment about reaching a jump host This is useful when your network is isolated, and you access it via a floating ip on one of the nodes. * Add python3 virtualenv * Fix certificate generation * Ensure we gather all facts * Try to avoid ssh timeout issues * Move to latest lustre release 2.12.4 * Fix up ansible install instructions * Fix slurm test script path * Add ansible config for fsansible * Make loops more consistent Hopefully also fixing python3 compatibility for all the loops. * Fix up name of delete playbook task * Update lustre to pick up centos 7.8 --- dac-ansible/README.md | 6 +++-- dac-ansible/ansible.cfg | 7 ++++++ dac-ansible/create-servers.py | 24 ++++++++++++-------- dac-ansible/master.yml | 17 ++++++++++---- dac-ansible/roles/data-acc/defaults/main.yml | 4 ++-- dac-ansible/roles/data-acc/tasks/main.yml | 5 ++-- fs-ansible/ansible.cfg | 8 +++++++ fs-ansible/delete.yml | 2 +- fs-ansible/roles/lustre/tasks/format.yaml | 12 +++++----- fs-ansible/roles/lustre/tasks/wipe.yaml | 8 +++---- 10 files changed, 63 insertions(+), 30 deletions(-) create mode 100644 dac-ansible/ansible.cfg create mode 100644 fs-ansible/ansible.cfg diff --git a/dac-ansible/README.md b/dac-ansible/README.md index 5792d16d..fe84ee43 100644 --- a/dac-ansible/README.md +++ b/dac-ansible/README.md @@ -8,10 +8,12 @@ preferred. Install Ansible and the OpenStack SDK, eg in a Python virtual environment: + sudo yum install python3-pip + sudo pip3 install virtualenv virtualenv .venv . 
.venv/bin/activate pip install -U pip - pip install -U ansible openstacksdk + pip install -U ansible openstacksdk openstackclient Pull in Ansible role dependencies: @@ -34,7 +36,7 @@ Once the Ansible has finished, you can login and try a Slurm test: ssh centos@ sudo -i scontrol show burstbuffer - /usr/local/bin/data-acc/tools/slurm-test.sh + /usr/local/bin/data-acc-v/tools/slurm-test.sh squeue scontrol show burstbuffer diff --git a/dac-ansible/ansible.cfg b/dac-ansible/ansible.cfg new file mode 100644 index 00000000..a6d1e488 --- /dev/null +++ b/dac-ansible/ansible.cfg @@ -0,0 +1,7 @@ +[defaults] +forks = 20 +gathering = smart + +[ssh_connection] +pipelining = True +retries = 3 diff --git a/dac-ansible/create-servers.py b/dac-ansible/create-servers.py index 4081f5dc..6f88625a 100755 --- a/dac-ansible/create-servers.py +++ b/dac-ansible/create-servers.py @@ -59,7 +59,12 @@ def main(): servers['slurm-cpu2'] = create_server( conn, 'slurm-cpu2', image, flavor, network) - inventory_template = """[dac_workers] + inventory_template = """[all:vars] +ansible_user=centos +# update if you have a jump host with a floating ip +#ansible_ssh_common_args='-C -o ControlMaster=auto -o ControlPersist=240s -o ProxyCommand="%s"' + +[dac_workers] dac1.dac.hpc.cam.ac.uk ansible_host=%s ansible_user=centos dac2.dac.hpc.cam.ac.uk ansible_host=%s ansible_user=centos dac3.dac.hpc.cam.ac.uk ansible_host=%s ansible_user=centos @@ -89,14 +94,15 @@ def main(): slurm_master slurm_workers""" - print inventory_template % ( - servers['dac1'], - servers['dac2'], - servers['dac3'], - servers['dac-etcd'], - servers['dac-slurm-master'], - servers['slurm-cpu1'], - servers['slurm-cpu2']) + print(inventory_template % ( + "ssh centos@jumphostip -W %h:%p", + servers['dac1'], + servers['dac2'], + servers['dac3'], + servers['dac-etcd'], + servers['dac-slurm-master'], + servers['slurm-cpu1'], + servers['slurm-cpu2'])) if __name__ == '__main__': diff --git a/dac-ansible/master.yml b/dac-ansible/master.yml index 
e4c3bbc6..1faea6fd 100644 --- a/dac-ansible/master.yml +++ b/dac-ansible/master.yml @@ -56,36 +56,45 @@ pki_dir: /home/centos/pki-dir pki_self_sign: True pki_ca: + expiry: '87600h' + pathlen: 0 cname: ca.dac.hpc.cam.ac.uk pki_servers: - cname: dac-etcd.dac.hpc.cam.ac.uk include_localhost: True + profile: client-server sans: - dac-etcd.dac.hpc.cam.ac.uk altips: - "{{ hostvars[groups['etcd_master'][0]].ansible_host }}" - cname: dac1.dac.hpc.cam.ac.uk include_localhost: True + profile: client-server sans: - dac1.dac.hpc.cam.ac.uk - cname: dac2.dac.hpc.cam.ac.uk include_localhost: True + profile: client-server sans: - dac2.dac.hpc.cam.ac.uk - cname: dac3.dac.hpc.cam.ac.uk - include_localhost: True + include_localhost: True + profile: client-server sans: - dac3.dac.hpc.cam.ac.uk - cname: dac-slurm-master.dac.hpc.cam.ac.uk include_localhost: True + profile: client-server sans: - dac-slurm-master.dac.hpc.cam.ac.uk - cname: slurm-cpu1.dac.hpc.cam.ac.uk include_localhost: True + profile: client-server sans: - slurm-cpu1.dac.hpc.cam.ac.uk - cname: slurm-cpu2.dac.hpc.cam.ac.uk include_localhost: True + profile: client-server sans: - slurm-cpu2.dac.hpc.cam.ac.uk @@ -141,7 +150,7 @@ - hosts: dac_workers:slurm_workers become: True vars: - lustre_release: "2.12.2" + lustre_release: "2.12.5" tasks: - name: enable lustre server repo yum_repository: @@ -168,7 +177,7 @@ - hosts: dac_workers:slurm_workers become: True vars: - lustre_release: "2.12.2" + lustre_release: "2.12.5" tasks: - name: Install Lustre Server yum: @@ -182,7 +191,7 @@ - hosts: dac_workers:slurm_workers become: True vars: - lustre_release: "2.12.2" + lustre_release: "2.12.5" tasks: - name: Install Lustre Client yum: diff --git a/dac-ansible/roles/data-acc/defaults/main.yml b/dac-ansible/roles/data-acc/defaults/main.yml index c2d77125..56c7e452 100644 --- a/dac-ansible/roles/data-acc/defaults/main.yml +++ b/dac-ansible/roles/data-acc/defaults/main.yml @@ -1,6 +1,6 @@ --- -data_acc_version: 'v2.0' -data_acc_checksum: 
'sha256:7c4d4535b0402aa40b8b0558c10fc12ae25fb70364df6d9ecfd9c699d491b735' +data_acc_version: 'v2.5' +data_acc_checksum: 'sha256:0f32004fe9b0b1b1fdea48c21491be20f4cda0135dcfc4eec25ac381433a8322' data_acc_platform: linux-amd64 data_acc_mirror: https://github.com/RSE-Cambridge/data-acc/releases/download data_acc_install_dir: /usr/local/bin diff --git a/dac-ansible/roles/data-acc/tasks/main.yml b/dac-ansible/roles/data-acc/tasks/main.yml index ce61a4df..154127e2 100644 --- a/dac-ansible/roles/data-acc/tasks/main.yml +++ b/dac-ansible/roles/data-acc/tasks/main.yml @@ -26,9 +26,9 @@ - dacd - dacctl -- name: Install python-virtualenv +- name: Install python3-virtualenv package: - name: python-virtualenv + name: python3-virtualenv state: present when: "'slurm' not in group_names" @@ -36,6 +36,7 @@ pip: name: ansible virtualenv: "{{data_acc_install_dir}}/{{data_acc_name}}/fs-ansible/.venv" + virtualenv_command: "virtualenv-3" when: "'slurm' not in group_names" - include_tasks: systemd.yml diff --git a/fs-ansible/ansible.cfg b/fs-ansible/ansible.cfg new file mode 100644 index 00000000..8c570556 --- /dev/null +++ b/fs-ansible/ansible.cfg @@ -0,0 +1,8 @@ +[defaults] +forks = 30 +gathering = smart + +[ssh_connection] +ssh_args = -o ControlMaster=auto -o ControlPersist=240s +pipelining = True +retries = 3 diff --git a/fs-ansible/delete.yml b/fs-ansible/delete.yml index fc8458de..7085e324 100644 --- a/fs-ansible/delete.yml +++ b/fs-ansible/delete.yml @@ -1,5 +1,5 @@ --- -- name: Create Lustre filesystem (format) +- name: Delete Lustre filesystem hosts: all any_errors_fatal: true become: yes diff --git a/fs-ansible/roles/lustre/tasks/format.yaml b/fs-ansible/roles/lustre/tasks/format.yaml index 7b4bd3b7..c98f08a3 100644 --- a/fs-ansible/roles/lustre/tasks/format.yaml +++ b/fs-ansible/roles/lustre/tasks/format.yaml @@ -17,7 +17,7 @@ part_end: "{{ mdt_size_mb }}MB" label: gpt state: present - loop: "{{ (list(mdts.keys()) + list(osts.keys())) | unique }}" + loop: "{{ osts | 
combine(mdts) | list }}" - name: Add OST Partition parted: device: "/dev/{{ item }}" @@ -26,7 +26,7 @@ part_end: "100%" label: gpt state: present - loop: "{{ (list(mdts.keys()) + list(osts.keys())) | unique }}" + loop: "{{ osts | combine(mdts) | list }}" - name: Format MDTs block: @@ -38,14 +38,14 @@ part_end: "{{ mdt_size_mb }}MB" label: gpt state: present - loop: "{{ mdts.keys() }}" + loop: "{{ mdts.keys() | list }}" - name: Wait for MDT partition to appear in /dev wait_for: path: "/dev/{{ item }}p1" timeout: 30 sleep: 5 - loop: "{{ mdts.keys() }}" + loop: "{{ mdts.keys() | list }}" - name: Reformat MDTs command: "/usr/sbin/mkfs.lustre --mdt --reformat --fsname={{ fs_name }} --index={{ item.value }} --mgsnode={{ mgsnode }}{{ lnet_suffix }} /dev/{{ item.key }}p1" @@ -61,14 +61,14 @@ part_end: "100%" label: gpt state: present - loop: "{{ osts.keys() }}" + loop: "{{ osts.keys() | list }}" - name: Wait for OST partition to appear in /dev wait_for: path: "/dev/{{ item }}p2" timeout: 30 sleep: 5 - loop: "{{ osts.keys() }}" + loop: "{{ osts.keys() | list }}" - name: Reformat OSTs command: "/usr/sbin/mkfs.lustre --ost --reformat --fsname={{ fs_name }} --index={{ item.value }} --mgsnode={{ mgsnode }}{{ lnet_suffix }} /dev/{{ item.key }}p2" diff --git a/fs-ansible/roles/lustre/tasks/wipe.yaml b/fs-ansible/roles/lustre/tasks/wipe.yaml index 8ec39e9c..7282c842 100644 --- a/fs-ansible/roles/lustre/tasks/wipe.yaml +++ b/fs-ansible/roles/lustre/tasks/wipe.yaml @@ -10,14 +10,14 @@ device: "/dev/{{ item }}" number: 1 state: absent - loop: "{{ (list(mdts.keys()) + list(osts.keys())) | unique }}" + loop: "{{ osts | combine(mdts) | list }}" - name: Remove old OST Partition parted: device: "/dev/{{ item }}" number: 2 state: absent - loop: "{{ (list(mdts.keys()) + list(osts.keys())) | unique }}" + loop: "{{ osts | combine(mdts) | list }}" - name: Wait for MDT partition to disappear from /dev wait_for: @@ -25,7 +25,7 @@ state: absent timeout: 120 sleep: 5 - loop: "{{ (list(mdts.keys()) 
+ list(osts.keys())) | unique }}" + loop: "{{ osts | combine(mdts) | list }}" - name: Wait for OST partition to disappear from /dev wait_for: @@ -33,4 +33,4 @@ state: absent timeout: 120 sleep: 5 - loop: "{{ (list(mdts.keys()) + list(osts.keys())) | unique }}" + loop: "{{ osts | combine(mdts) | list }}"