diff --git a/.cirun.yml b/.cirun.yml
new file mode 100644
index 00000000..d23f28a8
--- /dev/null
+++ b/.cirun.yml
@@ -0,0 +1,16 @@
+runners:
+  - name: cirun-aws-runner
+    # Cloud Provider: AWS
+    cloud: aws
+    # Instance type has 8 vCPUs, 32 GiB memory, up to 5 Gbps network performance
+    instance_type: t3.2xlarge
+    machine_image: ami-0eb199b995e2bc4e3
+    # ami-0a388df278199ff52
+    # Region: Oregon
+    region: us-west-2
+    # Try Spot instances first for cost savings, then fall back to on-demand
+    preemptible:
+      - true
+      - false
+    labels:
+      - cirun-runner
diff --git a/.github/scripts/extract_network_info.sh b/.github/scripts/extract_network_info.sh
new file mode 100644
index 00000000..4cce4cf0
--- /dev/null
+++ b/.github/scripts/extract_network_info.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+IFS=$'\n' # Split output into lines based on newline
+
+# `ip -br -4 address show` prints brief, IPv4-only output, one interface per line
+readarray -t lines <<< "$(ip -br -4 address show | grep UP)"
+for line in "${lines[@]}"; do
+    if [[ $line =~ (eth[0-9]|ens[0-9]+|enp[0-9].*) ]]; then
+        INTERFACE=$(echo "$line" | awk '{print $1}')
+        IP_RANGE=$(echo "$line" | awk '{print $3}')
+        break
+    fi
+done
+
+# Write variables into network_info.txt
+echo "Interface: $INTERFACE" > network_info.txt
+echo "IP Range: $IP_RANGE" >> network_info.txt
diff --git a/.github/scripts/gen_inventory.sh b/.github/scripts/gen_inventory.sh
new file mode 100644
index 00000000..2d625478
--- /dev/null
+++ b/.github/scripts/gen_inventory.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Check if the correct number of arguments was provided
+if [ $# -ne 2 ]; then
+    echo "Usage: $0 <hostname> <output_path>"
+    exit 1
+fi
+
+# Get the hostname from the first argument
+HOSTNAME=$1
+
+# Get the output path from the second argument
+OUTPUT_PATH=$2
+
+# Ensure the directory exists
+mkdir -p "$(dirname "$OUTPUT_PATH")"
+
+# Create the inventory.ini file at the specified output path with dynamic content
+cat <<EOF > "$OUTPUT_PATH"
+${HOSTNAME} ansible_connection=local ansible_ssh_host=127.0.0.1
+
+[hpc_master]
+${HOSTNAME}
+
+[hpc_worker]
+${HOSTNAME}
+EOF
+
+echo "inventory.ini file has been created at $OUTPUT_PATH."
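For reference, a quick local sanity check of gen_inventory.sh (the hostname hpc01-ci is illustrative; in the workflow below the script receives $(hostname -s)):

    $ ./.github/scripts/gen_inventory.sh hpc01-ci deploy/inventory.ini
    inventory.ini file has been created at deploy/inventory.ini.
    $ cat deploy/inventory.ini
    hpc01-ci ansible_connection=local ansible_ssh_host=127.0.0.1

    [hpc_master]
    hpc01-ci

    [hpc_worker]
    hpc01-ci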
diff --git a/.github/workflows/kvm-test.yaml b/.github/workflows/kvm-test.yaml
index ea8fa2c8..7fe6e964 100644
--- a/.github/workflows/kvm-test.yaml
+++ b/.github/workflows/kvm-test.yaml
@@ -1,41 +1,148 @@
 ---
-name: Vagrant (KVM) Tests
-
-on:
-  pull_request:
-  push:
-    branches:
-      - main
-
-jobs:
-  # https://github.com/jonashackt/vagrant-github-actions
-  test-kvm:
-    name: KVM Test
-    runs-on: macos-latest
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: Cache Vagrant boxes
-        uses: actions/cache@v2
-        with:
-          path: ~/.vagrant.d/boxes
-          key: ${{ runner.os }}-vagrant-${{ hashFiles('Vagrantfile') }}
-          restore-keys: |
-            ${{ runner.os }}-vagrant-
-
-      - name: Install test dependencies.
-        run: sudo pip3 install ansible
-
-      - name: Install Ansible Dependencies
-        working-directory: tests/ubuntu2004-singlenode
-        run: |
-          ansible-galaxy collection install community.general
-          ansible-galaxy collection install ansible.posix
-
-      - name: Show Vagrant version
-        run: vagrant --version
-
-# Disabled until we fix it
-# - name: Run vagrant up
-#   working-directory: tests/ubuntu2004-singlenode
-#   run: vagrant up
+name: Vagrant (KVM) Tests
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+
+jobs:
+  test-kvm:
+    name: KVM Test
+    # The /var/lib/dpkg/lock-frontend issue is handled in the "Disable unattended-upgrades" step below
+    runs-on: "cirun-runner--${{ github.run_id }}"
+    # runs-on: "ubuntu-latest"
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+          cache: "pip"
+
+      - name: Install dependencies
+        run: |
+          pip install ansible
+
+      - name: Install Ansible dependencies
+        run: |
+          ansible-galaxy collection install -r requirements.yaml
+
+      - name: Create deploy folder and move inventory files
+        run: |
+          mkdir deploy
+          cp -r inventory.template/* deploy/
+
+          chmod +x .github/scripts/gen_inventory.sh
+          ./.github/scripts/gen_inventory.sh $(hostname -s) deploy/inventory.ini
+
+      - name: Check network adapter
+        run: |
+          ip a
+
+      - name: Check hosts
+        run: |
+          cat /etc/hosts
+
+      - name: Extract Network Information
+        run: |
+          chmod +x .github/scripts/extract_network_info.sh
+          ./.github/scripts/extract_network_info.sh
+          echo "adapter_name=$(head -1 network_info.txt | awk '{print $2}')" >> $GITHUB_ENV
+          echo "ip_range=$(awk 'NR > 1 && $3 {print $3}' network_info.txt)" >> $GITHUB_ENV
+
+      - name: Update group vars
+        run: |
+          cp deploy/group_vars/all.yaml deploy/group_vars/all.yaml.bak
+
+          echo "Updating group vars for firewall and internal network"
+          echo "firewall_internal_ip_range: $ip_range" >> deploy/group_vars/all.yaml
+          echo "internal_interface: $adapter_name" >> deploy/group_vars/all.yaml
+          echo "SlurmConfigFileDIr: /etc/slurm" >> deploy/group_vars/all.yaml
+
+          echo "Replace hpc01-test with $(hostname -s) in group_vars/hpc_worker.yaml file"
+          sed -i "s/hpc01-test/$(hostname -s)/g" deploy/group_vars/hpc_worker.yaml
+
+          echo "Replace LDAP server URI with $(hostname -s) in group_vars/all.yaml file"
+          sed -i "s|ldap://hpc01-test:389|ldap://$(hostname -s):389|g" deploy/group_vars/all.yaml
+
+          diff deploy/group_vars/all.yaml.bak deploy/group_vars/all.yaml || true
+
+      - name: Disable unattended-upgrades
+        run: |
+          # Run all commands non-interactively
+          export DEBIAN_FRONTEND=noninteractive
+
+          # Stop the unattended-upgrades service if it is active
+          if systemctl is-active --quiet unattended-upgrades; then
+            echo "Stopping unattended-upgrades service..."
+            sudo systemctl stop unattended-upgrades
+          else
+            echo "unattended-upgrades service is not active. Skipping stop command."
+          fi
+
+          # Kill any APT processes still holding the dpkg lock
+          echo "Checking and killing running APT processes if necessary..."
+          sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n +2 | while read PID; do
+            if [ ! -z "$PID" ]; then
+              echo "Killing PID $PID"
+              sudo kill -9 $PID
+            fi
+          done
+
+          # Configure any packages left in an unclean state, non-interactively
+          echo "Configuring any packages in an unclean state..."
+          sudo dpkg --configure -a
+
+          # Remove unattended-upgrades to avoid background updates during the run
+          echo "Disabling unattended upgrades..."
+          sudo apt-get remove --purge unattended-upgrades -y || true
+
+      - name: List disk space
+        if: success() || failure()
+        run: |
+          df -h -l
+
+      # Open a tmate session for interactive debugging
+      - name: Setup tmate session
+        uses: mxschmitt/action-tmate@v3
+
+      - name: Move hpc variables from group_vars to temp location
+        run: |
+          mv deploy/group_vars/hpc_*.yaml /tmp
+
+      # - name: Includes long demanding package installs
+      #   run: |
+      #     echo "mysql_enabled: true" >> deploy/group_vars/hpc_master.yaml
+      #     echo "postgresql_enabled: true" >> deploy/group_vars/hpc_master.yaml
+
+      - name: Run ansible playbook (partial)
+        run: |
+          cd deploy
+          ansible-playbook ../playbook.yaml -i inventory.ini --connection=local -vvv
+        env:
+          ANSIBLE_FORCE_COLOR: True
+          ANSIBLE_CALLBACKS_ENABLED: "profile_tasks"
+
+      - name: Move hpc variables back to group_vars
+        run: |
+          mv /tmp/hpc_*.yaml deploy/group_vars/
+
+      - name: List disk space
+        if: success() || failure()
+        run: |
+          df -h -l
+
+      - name: Run ansible playbook (Full)
+        run: |
+          cd deploy
+          ansible-playbook ../playbook.yaml -i inventory.ini --connection=local -vvv
+        env:
+          ANSIBLE_FORCE_COLOR: True
+          ANSIBLE_CALLBACKS_ENABLED: "profile_tasks"
+
+      - name: List disk space
+        if: success() || failure()
+        run: |
+          df -h -l
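For reference, the "Extract Network Information" step above derives two workflow variables from the two-line network_info.txt written by the helper script. A minimal sketch of that flow, assuming an illustrative runner interface ens5 (real values depend on the instance):

    $ cat network_info.txt
    Interface: ens5
    IP Range: 172.31.5.17/20
    $ head -1 network_info.txt | awk '{print $2}'   # becomes adapter_name
    ens5
    $ awk 'NR > 1 && $3 {print $3}' network_info.txt   # becomes ip_range
    172.31.5.17/20

Both values are appended to $GITHUB_ENV and then written into deploy/group_vars/all.yaml as internal_interface and firewall_internal_ip_range in the "Update group vars" step.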
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
index 8ad0e24e..160b8458 100644
--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/lint.yaml
@@ -3,7 +3,17 @@ name: Ansible Lint
 
 on:
   push:
+    paths:
+      - 'roles/**'
+      - 'tasks/**'
+      - '.github/workflows/lint.yaml'
+
+  pull_request:
+    paths:
+      - 'roles/**'
+      - 'tasks/**'
+      - '.github/workflows/lint.yaml'
 
 jobs:
   build:
     name: Ansible Lint
diff --git a/roles/apt_packages/tasks/main.yml b/roles/apt_packages/tasks/main.yml
index 74722cc9..3a00a5ef 100644
--- a/roles/apt_packages/tasks/main.yml
+++ b/roles/apt_packages/tasks/main.yml
@@ -1,6 +1,7 @@
 ---
 - name: Ensure apt packages are installed
   become: true
+  timeout: 300
   ansible.builtin.apt:
     name: "{{ installed_packages }}"
     state: latest
diff --git a/roles/backups/tasks/backup.yaml b/roles/backups/tasks/backup.yaml
index 49aab111..c6affbdf 100644
--- a/roles/backups/tasks/backup.yaml
+++ b/roles/backups/tasks/backup.yaml
@@ -1,6 +1,7 @@
 ---
 - name: Ensure restic installed
   become: true
+  timeout: 300
   ansible.builtin.apt:
     name: restic
     state: latest
diff --git a/roles/cifs/handlers/main.yaml b/roles/cifs/handlers/main.yaml
index fcd0b3cc..c66ca20a 100644
--- a/roles/cifs/handlers/main.yaml
+++ b/roles/cifs/handlers/main.yaml
@@ -5,6 +5,5 @@
     name: "{{ item }}"
     enabled: "yes"
     state: restarted
-    cmd: ""
   with_items:
     - smbd
diff --git a/roles/cifs/tasks/client.yaml b/roles/cifs/tasks/client.yaml
index 5524ad08..1ef2a4ef 100644
--- a/roles/cifs/tasks/client.yaml
+++ b/roles/cifs/tasks/client.yaml
@@ -1,6 +1,7 @@
 ---
 - name: Install cifs
   become: true
+  timeout: 300
   ansible.builtin.apt:
     state: latest
     cache_valid_time: 3600
diff --git a/roles/cifs/tasks/server.yaml b/roles/cifs/tasks/server.yaml
index 8427455c..0750f3c9 100644
--- a/roles/cifs/tasks/server.yaml
+++ b/roles/cifs/tasks/server.yaml
@@ -1,6 +1,7 @@
 ---
 - name: Install samba
   become: true
+  timeout: 300
   ansible.builtin.apt:
     state: latest
     cache_valid_time: 3600
@@ -22,4 +23,4 @@
     owner: root
     group: root
     mode: "0644"
-  notify: restart services samba
+  notify: Restart services samba
diff --git a/roles/dask_gateway/handlers/main.yaml b/roles/dask_gateway/handlers/main.yaml
index 7c083370..9ecb4de8 100644
--- a/roles/dask_gateway/handlers/main.yaml
+++ b/roles/dask_gateway/handlers/main.yaml
@@ -5,6 +5,5 @@
     name: "{{ item }}"
     enabled: "yes"
     state: restarted
-    cmd: ""
   with_items:
     - dask-gateway
diff --git a/roles/dask_gateway/tasks/dask_gateway.yaml b/roles/dask_gateway/tasks/dask_gateway.yaml
index fb364c30..6ce9110b 100644
--- a/roles/dask_gateway/tasks/dask_gateway.yaml
+++ b/roles/dask_gateway/tasks/dask_gateway.yaml
@@ -49,7 +49,7 @@
     owner: dask
     group: dask
     mode: "0644"
-  notify: restart services dask-gateway
+  notify: Restart services dask-gateway
 
 - name: Copy the dask-gateway systemd service file
   become: true
@@ -77,7 +77,7 @@
     owner: root
     group: root
     mode: "0644"
-  notify: restart services dask-gateway
+  notify: Restart services dask-gateway
 
 - name: Ensure dask-gateway is enabled on boot
   become: true
diff --git a/roles/grafana/tasks/grafana.yaml b/roles/grafana/tasks/grafana.yaml
index 00214a52..b7291305 100644
--- a/roles/grafana/tasks/grafana.yaml
+++ b/roles/grafana/tasks/grafana.yaml
@@ -12,6 +12,7 @@
 
 - name: Install grafana
   become: true
+  timeout: 300
   ansible.builtin.apt:
     name: grafana{{ grafana_version }}
     state: "{% if grafana_version %}present{% else %}latest{% endif %}"
diff --git a/roles/jupyterhub/handlers/main.yaml b/roles/jupyterhub/handlers/main.yaml
index c27dcd6d..07a5fd09 100644
--- a/roles/jupyterhub/handlers/main.yaml
+++ b/roles/jupyterhub/handlers/main.yaml
@@ -5,7 +5,6 @@
     name: "{{ item }}"
     enabled: "yes"
     state: restarted
-    cmd: ""
   with_items:
     - jupyterhub
 
@@ -15,7 +14,6 @@
     name: "{{ item }}"
     enabled: "yes"
     state: restarted
-    cmd: ""
   with_items:
     - jupyterhub-proxy
 
@@ -25,6 +23,5 @@
     name: "{{ item }}"
     enabled: "yes"
     state: restarted
-    cmd: ""
   with_items:
     - jupyterhub-ssh
diff --git a/roles/keycloak/tasks/keycloak.yaml b/roles/keycloak/tasks/keycloak.yaml
index 917991ae..5d242ac0 100644
--- a/roles/keycloak/tasks/keycloak.yaml
+++ b/roles/keycloak/tasks/keycloak.yaml
@@ -1,6 +1,7 @@
 ---
 - name: Install openjdk and python requirements
   become: true
+  timeout: 300
   ansible.builtin.apt:
     state: latest
     cache_valid_time: 3600
@@ -64,7 +65,8 @@
 
 - name: Ensure Keycloak admin user exists
   become: true
-  ansible.builtin.command: /opt/keycloak-{{ keycloak_version }}/bin/add-user-keycloak.sh -r master -u "{{ keycloak_admin_username }}" -p "{{ keycloak_admin_password
+  ansible.builtin.command:
+    /opt/keycloak-{{ keycloak_version }}/bin/add-user-keycloak.sh -r master -u "{{ keycloak_admin_username }}" -p "{{ keycloak_admin_password
   }}"
   args:
     creates: /opt/keycloak-{{ keycloak_version }}/standalone/configuration/keycloak-add-user.json
diff --git a/roles/mysql/defaults/main.yml b/roles/mysql/defaults/main.yml
index 8931fe34..2446dffa 100644
--- a/roles/mysql/defaults/main.yml
+++ b/roles/mysql/defaults/main.yml
@@ -1,5 +1,7 @@
 ---
 mysql_enabled: false
+mysql_config_file: /etc/mysql/my.cnf
+
 mysql_databases:
   - slurm
   - conda-store
@@ -14,3 +16,10 @@ mysql_users:
   - username: conda-store
     password: eIbmUditL4RbQm0YPeLozRme
     privileges: "*.*:ALL"
+
+# Define a custom list of packages to install
+mysql_packages:
+  - mysql-server
+  - mysql-common
+
+mysql_python_package: python3-mysqldb
diff --git a/roles/mysql/handlers/main.yaml b/roles/mysql/handlers/main.yaml
index 96781a56..a89efe3b 100644
--- a/roles/mysql/handlers/main.yaml
+++ b/roles/mysql/handlers/main.yaml
@@ -5,6 +5,5 @@
     name: "{{ item }}"
     enabled: "yes"
     state: restarted
-    cmd: ""
   with_items:
     - mysql
diff --git a/roles/mysql/tasks/mysql.yaml b/roles/mysql/tasks/mysql.yaml
index 19f565f0..eeebca87 100644
--- a/roles/mysql/tasks/mysql.yaml
+++ b/roles/mysql/tasks/mysql.yaml
@@ -1,14 +1,34 @@
 ---
-- name: Install mysql
+- name: Check if MySQL is already installed.
+  ansible.builtin.stat:
+    path: "{{ mysql_config_file }}"
+  register: mysql_installed
+
+- name: Update apt cache if MySQL is not yet installed.
+  become: true
+  ansible.builtin.apt:
+    update_cache: true
+  changed_when: false
+  when: not mysql_installed.stat.exists
+
+- name: Ensure MySQL Python libraries are installed.
+  become: true
+  timeout: 300
+  ansible.builtin.apt:
+    name: "{{ mysql_python_package }}"
+    state: present
+
+- name: Ensure MySQL packages are installed.
   become: true
+  timeout: 600
   ansible.builtin.apt:
-    name:
-      - mysql-server
-      - python3
-      - python3-pip
-      - python3-mysqldb
-    state: latest
-    cache_valid_time: 3600
+    name: "{{ mysql_packages }}"
+    state: present
+  register: mysql_install_packages
+
+- name: Check if MySQL packages were installed.
+  ansible.builtin.set_fact:
+    mysql_install_packages: "{{ mysql_install_packages is defined and mysql_install_packages.changed }}"
 
 - name: Ensure mysql settings in file
   become: true
@@ -17,20 +37,67 @@
     section: mysqld
     option: "{{ item.key }}"
     value: "{{ item.value }}"
     mode: "0644"
     backup: true
   with_dict: "{{ mysql_config }}"
-  notify: restart services mysql
+  notify: Restart services mysql
 
-- name: Create mysql database
+- name: Ensure MySQL is running and enabled
   become: true
+  ansible.builtin.service:
+    name: mysql
+    state: started
+    enabled: true
+  register: mysql_service
+
+- name: Check if the debian.cnf file exists
+  ansible.builtin.stat:
+    path: /etc/mysql/debian.cnf
+  register: debian_cnf_file
+
+- name: Grab the debian.cnf content
+  become: true
+  ansible.builtin.slurp:
+    src: /etc/mysql/debian.cnf
+  register: debian_cnf_content
+  when: debian_cnf_file.stat.exists
+
+- name: Set facts from debian.cnf content
+  ansible.builtin.set_fact:
+    debian_sys_maint_user: "debian-sys-maint"
+    # Parse the password from the first "password = ..." line of debian.cnf
+    debian_sys_maint_password: "{{ (debian_cnf_content['content'] | b64decode).split('\n') | select('match', '^password = ') | first | split('=') | last | trim }}"
+  when: debian_cnf_file.stat.exists
+
+- name: Create mysql databases using debian-sys-maint credentials
   community.mysql.mysql_db:
+    login_user: "{{ debian_sys_maint_user | default(omit) }}"
+    login_password: "{{ debian_sys_maint_password | default(omit) }}"
     name: "{{ item }}"
     state: present
   with_items: "{{ mysql_databases }}"
+  when: debian_cnf_file.stat.exists
 
-- name: Create mysql users
-  become: true
+- name: Create mysql databases as root (if debian.cnf does not exist)
+  community.mysql.mysql_db:
+    name: "{{ item }}"
+    state: present
+  with_items: "{{ mysql_databases }}"
+  when: not debian_cnf_file.stat.exists
+
+- name: Create mysql users using debian-sys-maint credentials
+  community.mysql.mysql_user:
+    login_user: "{{ debian_sys_maint_user | default(omit) }}"
+    login_password: "{{ debian_sys_maint_password | default(omit) }}"
+    name: "{{ item.username }}"
+    password: "{{ item.password }}"
+    priv: "{{ item.privileges }}"
+    state: present
+  with_items: "{{ mysql_users }}"
+  no_log: true # Avoid logging user creds
+  when: debian_cnf_file.stat.exists
+
+- name: Create mysql users as root (if debian.cnf does not exist)
   community.mysql.mysql_user:
     name: "{{ item.username }}"
     password: "{{ item.password }}"
@@ -38,3 +105,4 @@
     state: present
   with_items: "{{ mysql_users }}"
   no_log: true # Avoid logging user creds
+  when: not debian_cnf_file.stat.exists
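A note on the debian-sys-maint flow above: Debian and Ubuntu MySQL packages ship /etc/mysql/debian.cnf with maintenance credentials, and the slurp/set_fact pair parses that file so databases and users can be created without a root password. A minimal sketch of the parsing, against an illustrative debian.cnf (the real password is generated per host):

    $ sudo cat /etc/mysql/debian.cnf
    [client]
    host     = localhost
    user     = debian-sys-maint
    password = AbCdEf123456
    # The Jinja chain  split('\n') | select('match', '^password = ') | first | split('=') | last | trim
    # keeps the first "password = ..." line and yields: AbCdEf123456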
item.host }}" port: 2049 - timeout: 600 + timeout: 600 # Set a timeout of 10 minutes with_items: "{{ nfs_client_mounts }}" - name: Ensure nfs mounted directories exist diff --git a/roles/nfs/tasks/server.yaml b/roles/nfs/tasks/server.yaml index 41b91418..d96cc02d 100644 --- a/roles/nfs/tasks/server.yaml +++ b/roles/nfs/tasks/server.yaml @@ -1,5 +1,6 @@ --- -- name: Install nfs +- name: Install nfs packages + timeout: 300 become: true ansible.builtin.apt: state: latest diff --git a/roles/openldap/handlers/main.yaml b/roles/openldap/handlers/main.yaml index bca5e5b4..ca5c23bf 100644 --- a/roles/openldap/handlers/main.yaml +++ b/roles/openldap/handlers/main.yaml @@ -6,7 +6,6 @@ name: "{{ item }}" enabled: "yes" state: restarted - cmd: "" with_items: - nscd - nslcd diff --git a/roles/openldap/tasks/client.yaml b/roles/openldap/tasks/client.yaml index 80503277..5e99d9f1 100644 --- a/roles/openldap/tasks/client.yaml +++ b/roles/openldap/tasks/client.yaml @@ -1,6 +1,7 @@ --- - name: Install ldap client packages become: true + timeout: 300 ansible.builtin.apt: name: - libpam-ldapd @@ -15,7 +16,7 @@ regexp: pam_mkhomedir\.so line: session required pam_mkhomedir.so skel=/etc/skel/ umask=0022 state: present - notify: restart services ldap + notify: Restart services ldap - name: LDAP Authentication | Query ldap in nsswitch.conf become: true @@ -28,7 +29,7 @@ - passwd - shadow - group - notify: restart services ldap + notify: Restart services ldap - name: LDAP Authentication | no cache for ldap in nscd.conf become: true @@ -40,7 +41,7 @@ with_items: - passwd - group - notify: restart services ldap + notify: Restart services ldap - name: LDAP Authentication | Configure /etc/nslcd.conf become: true @@ -48,4 +49,4 @@ src: nslcd.conf.j2 dest: /etc/nslcd.conf mode: "0600" - notify: restart services ldap + notify: Restart services ldap diff --git a/roles/openldap/tasks/openldap.yaml b/roles/openldap/tasks/openldap.yaml index 3719f3ce..b800a19f 100644 --- a/roles/openldap/tasks/openldap.yaml +++ b/roles/openldap/tasks/openldap.yaml @@ -1,6 +1,7 @@ --- - name: Install openldap packages become: true + timeout: 300 ansible.builtin.apt: name: - slapd diff --git a/roles/postgresql/handlers/main.yaml b/roles/postgresql/handlers/main.yaml index e59eec69..393c4b69 100644 --- a/roles/postgresql/handlers/main.yaml +++ b/roles/postgresql/handlers/main.yaml @@ -5,6 +5,5 @@ name: "{{ item }}" enabled: "yes" state: restarted - cmd: "" with_items: - postgresql diff --git a/roles/postgresql/tasks/postgresql.yaml b/roles/postgresql/tasks/postgresql.yaml index 6a4fe492..ddc9fe81 100644 --- a/roles/postgresql/tasks/postgresql.yaml +++ b/roles/postgresql/tasks/postgresql.yaml @@ -1,6 +1,7 @@ --- - name: Install PostgreSQL become: true + timeout: 600 ansible.builtin.apt: name: - postgresql @@ -11,6 +12,7 @@ cache_valid_time: 3600 - name: Ensure PostgreSQL service is running + become: true ansible.builtin.systemd: name: postgresql state: started diff --git a/roles/slurm/defaults/main.yml b/roles/slurm/defaults/main.yml index 5fa83f95..25a5363f 100644 --- a/roles/slurm/defaults/main.yml +++ b/roles/slurm/defaults/main.yml @@ -6,6 +6,8 @@ slurmd_enabled: false slurmctld_enabled: false slurmdbd_enabled: false +SlurmConfigFileDIr: /etc/slurm-llnl + slurm_config: ClusterName: cluster # slurmctld options diff --git a/roles/slurm/tasks/main.yaml b/roles/slurm/tasks/main.yaml index 091f8f11..8c613a43 100644 --- a/roles/slurm/tasks/main.yaml +++ b/roles/slurm/tasks/main.yaml @@ -5,6 +5,7 @@ - name: Install slurm client packages become: 
true + timeout: 300 ansible.builtin.apt: state: latest cache_valid_time: 3600 @@ -15,7 +16,7 @@ - name: Ensure that slurm configuration directory exists become: true ansible.builtin.file: - path: /etc/slurm + path: "{{ SlurmConfigFileDIr }}" state: directory mode: "0755" owner: root @@ -25,10 +26,10 @@ become: true ansible.builtin.template: src: templates/slurm.conf - dest: /etc/slurm/slurm.conf + dest: "{{ SlurmConfigFileDIr }}/slurm.conf" owner: root group: root - mode: "0444" + mode: "0755" register: _slurm_config - name: Install extra execution host configs @@ -39,7 +40,7 @@ ConstrainCores=yes ConstrainRAMSpace=yes ConstrainSwapSpace=yes - dest: /etc/slurm/cgroup.conf + dest: "{{ SlurmConfigFileDIr }}/cgroup.conf" owner: root group: root mode: "0444" diff --git a/roles/slurm/tasks/munge.yaml b/roles/slurm/tasks/munge.yaml index 63d89761..a8d69e40 100644 --- a/roles/slurm/tasks/munge.yaml +++ b/roles/slurm/tasks/munge.yaml @@ -38,6 +38,7 @@ - name: Install munge controller packages become: true + timeout: 300 ansible.builtin.apt: state: latest cache_valid_time: 3600 diff --git a/roles/slurm/tasks/slurm_exporter.yaml b/roles/slurm/tasks/slurm_exporter.yaml index dd595cc9..da14ae97 100644 --- a/roles/slurm/tasks/slurm_exporter.yaml +++ b/roles/slurm/tasks/slurm_exporter.yaml @@ -1,6 +1,7 @@ --- - name: Install golang ansible.builtin.include_tasks: golang.yaml + - name: Check that the slurm exporter binary exists ansible.builtin.stat: path: /usr/local/bin/prometheus_slurm_exporter diff --git a/roles/slurm/tasks/slurmctld.yaml b/roles/slurm/tasks/slurmctld.yaml index 0eae5e4a..b990f5f7 100644 --- a/roles/slurm/tasks/slurmctld.yaml +++ b/roles/slurm/tasks/slurmctld.yaml @@ -1,22 +1,28 @@ --- +# Must be writable by user SlurmUser. +# The file must be accessible by the primary and backup control machines. - name: Ensure slurm state directory exists become: true ansible.builtin.file: path: "{{ slurm_config.StateSaveLocation }}" state: directory - mode: "0700" + mode: "0755" owner: slurm group: slurm +# Must be writable by user SlurmUser. +# The file must be accessible by the primary and backup control machines. - name: Ensure slurm log directory exists become: true ansible.builtin.file: path: "{{ slurm_config.SlurmctldLogFile | dirname }}" state: directory - mode: "0700" + mode: "0755" owner: slurm group: slurm +# Must be writable by user root. Preferably writable and removable by SlurmUser. +# The file must be accessible by the primary and backup control machines. - name: Ensure slurm pid directory exists become: true ansible.builtin.file: @@ -33,7 +39,7 @@ [Unit] Description=Slurm controller daemon After=network.target munge.service - ConditionPathExists=/etc/slurm/slurm.conf + ConditionPathExists={{ SlurmConfigFileDIr }}/slurm.conf [Service] Type=forking @@ -54,6 +60,7 @@ - name: Install slurm controller packages become: true + timeout: 300 ansible.builtin.apt: state: latest cache_valid_time: 3600 diff --git a/roles/slurm/tasks/slurmd.yaml b/roles/slurm/tasks/slurmd.yaml index 7105c146..b44ec6b0 100644 --- a/roles/slurm/tasks/slurmd.yaml +++ b/roles/slurm/tasks/slurmd.yaml @@ -1,4 +1,6 @@ --- +# Permissions must be set to 755 so that job scripts can be executed from this directory. +# A distinct file must exist on each compute node. - name: Create slurm spool directory become: true ansible.builtin.file: @@ -8,6 +10,8 @@ mode: "0755" state: directory +# Must be writable by user root. +# A distinct file must exist on each compute node. 
- name: Create slurm log directory become: true ansible.builtin.file: @@ -17,6 +21,8 @@ mode: "0755" state: directory +# Must be writable by user root. +# A distinct file must exist on each compute node. - name: Ensure slurm pid directory exists become: true ansible.builtin.file: @@ -33,7 +39,7 @@ [Unit] Description=Slurm node daemon After=network.target munge.service remote-fs.target - ConditionPathExists=/etc/slurm/slurm.conf + ConditionPathExists={{ SlurmConfigFileDIr }}/slurm.conf [Service] Type=forking diff --git a/roles/slurm/tasks/slurmdbd.yaml b/roles/slurm/tasks/slurmdbd.yaml index 3cf1f7e7..14aba571 100644 --- a/roles/slurm/tasks/slurmdbd.yaml +++ b/roles/slurm/tasks/slurmdbd.yaml @@ -1,13 +1,15 @@ --- +# Must be writable by user SlurmUser. - name: Ensure slurmdbd log directory exists become: true ansible.builtin.file: path: "{{ slurmdbd_config.LogFile | dirname }}" state: directory - mode: "0700" + mode: "0755" owner: slurm group: slurm +# Must be writable by user SlurmUser. - name: Ensure slurm pid directory exists become: true ansible.builtin.file: @@ -17,11 +19,15 @@ owner: slurm group: slurm +# This file should be only on the computer where SlurmDBD executes +# and should only be readable by the user which executes SlurmDBD (e.g. "slurm"). +# This file should be protected from unauthorized access since +# it contains a database password - name: Install slurmdbd.conf become: true ansible.builtin.template: src: templates/slurmdbd.conf - dest: /etc/slurm/slurmdbd.conf + dest: "{{ SlurmConfigFileDIr }}/slurmdbd.conf" owner: slurm group: slurm mode: "0600" @@ -34,7 +40,7 @@ [Unit] Description=Slurm DBD accounting daemon After=network.target munge.service - ConditionPathExists=/etc/slurm/slurmdbd.conf + ConditionPathExists={{ SlurmConfigFileDIr }}/slurmdbd.conf [Service] Type=forking @@ -55,6 +61,7 @@ - name: Install slurm controller packages become: true + timeout: 300 ansible.builtin.apt: state: latest cache_valid_time: 3600 diff --git a/roles/traefik/handlers/main.yaml b/roles/traefik/handlers/main.yaml index c52eac80..2ff770f8 100644 --- a/roles/traefik/handlers/main.yaml +++ b/roles/traefik/handlers/main.yaml @@ -5,6 +5,5 @@ name: "{{ item }}" enabled: "yes" state: restarted - cmd: "" with_items: - traefik diff --git a/roles/traefik/tasks/traefik.yaml b/roles/traefik/tasks/traefik.yaml index 48329849..b3e303ab 100644 --- a/roles/traefik/tasks/traefik.yaml +++ b/roles/traefik/tasks/traefik.yaml @@ -89,7 +89,7 @@ owner: traefik group: traefik when: traefik_tls_certificate is defined - notify: restart services traefik + notify: Restart services traefik register: _traefik_tls_certificate - name: Copy TLS key if provided @@ -102,7 +102,7 @@ owner: traefik group: traefik when: traefik_tls_key is defined - notify: restart services traefik + notify: Restart services traefik register: _traefik_tls_key - name: Copy traefik configuration @@ -113,7 +113,7 @@ mode: "0600" owner: traefik group: traefik - notify: restart services traefik + notify: Restart services traefik - name: Copy traefik dynamic configuration become: true @@ -123,7 +123,7 @@ mode: "0600" owner: traefik group: traefik - notify: restart services traefik + notify: Restart services traefik - name: Copy the traefik systemd service file become: true @@ -155,7 +155,7 @@ owner: root group: root mode: "0644" - notify: restart services traefik + notify: Restart services traefik - name: Ensure Traefik is enabled on boot become: true