diff --git a/experimental/lustre_ipoib/config.json b/experimental/lustre_ipoib/config.json new file mode 100644 index 000000000..be7c23449 --- /dev/null +++ b/experimental/lustre_ipoib/config.json @@ -0,0 +1,221 @@ +{ + "location": "variables.location", + "resource_group": "variables.resource_group", + "install_from": "headnode", + "admin_user": "hpcadmin", + "vnet": { + "name": "hpcvnet", + "address_prefix": "10.2.0.0/20", + "subnets": { + "compute": "10.2.0.0/22", + "storage": "10.2.4.0/24" + } + }, + "variables": { + "location": "", + "resource_group": "", + "image": "OpenLogic:CentOS-HPC:7.6:latest", + "lustreimage": "OpenLogic:CentOS-HPC:7.6:latest", + "drivenum": 4, + "ossnum": 4, + "low_priority": true, + "storage_account": "", + "storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + "la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.10", + "lustre_mount": "/lustre" + }, + "resources": { + "headnode": { + "type": "vm", + "vm_type": "Standard_HC44rs", + "accelerated_networking": false, + "public_ip": true, + "image": "variables.image", + "subnet": "compute", + "tags": [ + "disable-selinux", + "cndefault", + "localuser", + "pbsserver", + "loginnode", + "nfsserver" + ] + }, + "lustre": { + "type": "vmss", + "vm_type": "Standard_HC44rs", + "instances": "9", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre[0:5]", + "osses[1:5]", + "lfsrepo", + "lfsclient[5:9]", + "localuser", + "pbsclient[5:9]", + "nfsclient", + "disable-selinux", + "lfsloganalytics" + ] + } + }, + "install": [ + { + "script": "disable-selinux.sh", + "tag": "disable-selinux", + "sudo": true + }, + { + "script": "cndefault.sh", + "tag": "cndefault", + "sudo": true + }, + { + "script": "nfsserver.sh", + "tag": "nfsserver", + "sudo": true + }, + { + "script": "nfsclient.sh", + "args": [ + "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. + +The configuration file requires the following variables to be set: + +| Variable | Description | +|-------------------------|----------------------------------------------| +| resource_group | The resource group for the project | +| storage_account | The storage account for HSM | +| storage_key | The storage key for HSM | +| storage_container | The container to use for HSM | +| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | +| la_resourcegroup | The resource group for Log Analytics | +| la_name | The Log Analytics Workspace name | + +> Note: you can remove log anaytics and/or HSM from the config file if not required. + +> Note: Key Vault should be used for the keys to keep them out of the config files. 
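A typical deployment with the Python-based AzureHPC CLI is sketched below. The resource group, storage account, and Log Analytics names are placeholders to replace with your own values, and the exact flags may differ slightly between AzureHPC releases:

```bash
# initialise a working directory from this config, setting the required variables
azhpc-init -c experimental/lustre_ipoib/config.json -d lustre_ipoib_deploy \
    -v location=westeurope,resource_group=my-lustre-rg,storage_account=mylustresa,storage_container=hsm,log_analytics_lfs_name=lfs,la_resourcegroup=my-la-rg,la_name=my-la-workspace

# build the cluster from the working directory
cd lustre_ipoib_deploy
azhpc-build

# connect to the head node once the build completes
azhpc-connect -u hpcuser headnode
```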
diff --git a/experimental/lustre_ipoib/scripts/installdrives.sh b/experimental/lustre_ipoib/scripts/installdrives.sh new file mode 100755 index 000000000..221e2349e --- /dev/null +++ b/experimental/lustre_ipoib/scripts/installdrives.sh @@ -0,0 +1,33 @@ +#!/bin/bash +groupname=$1 +vmlist=$2 +ossnum=$3 +drivenum=$4 + +#create the drives first before attachint to vmss +drivecount=$(($drivenum*$ossnum)) + +for ((num=1; num<=$drivecount; num++)); do + az disk create -g $groupname -n "lustredrive$num" --size-gb 1024 & +done + +sleep 60 # to ensure all drives are made + +#Now use the created drives +index=0 +lustrecnt=1 + +idlisttmp=$(az vmss list-instances --resource-group $groupname --name lustre |grep providers/Microsoft.Compute/virtualMachineScaleSets/lustre/virtualMachines | awk -F "virtualMachines/" '{print $2}' | sed '/networkInterfaces/d'| sed 's/["].*$//') + +idlist=($idlisttmp) + +for vmname in ${vmlist[@]}; do + ((index++)) + if [ $index -gt 0 ] ; then + for ((diskid=1; diskid<=$drivenum; diskid++)); do + az vmss disk attach --vmss-name lustre --disk lustredrive${lustrecnt} --sku Premium_LRS --instance-id ${idlist[$index]} --resource-group $groupname + ((lustrecnt++)) + done + fi +done + diff --git a/experimental/lustre_ipoib/scripts/lfsclient.sh b/experimental/lustre_ipoib/scripts/lfsclient.sh new file mode 100755 index 000000000..4e30d37fa --- /dev/null +++ b/experimental/lustre_ipoib/scripts/lfsclient.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} +mkdir ~/.ssh + +cp -r /share/home/hpcuser/.ssh ~/ + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if rpm -q lustre; then + + # if the server packages are installed only the client kmod is needed + # for 2.10 and nothing extra is needed for 2.12 + if [ "$lustre_version" = "2.10" ]; then + + if ! rpm -q kmod-lustre-client; then + yum -y install kmod-lustre-client + fi + + fi + +else + + # install the client RPMs if not already installed + if ! rpm -q lustre-client kmod-lustre-client; then + yum -y install lustre-client kmod-lustre-client + fi + weak-modules --add-kernel $(uname -r) + +fi +#Include the correct infiniband options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/experimental/lustre_ipoib/scripts/lfsmaster.sh b/experimental/lustre_ipoib/scripts/lfsmaster.sh new file mode 100755 index 000000000..dce36a159 --- /dev/null +++ b/experimental/lustre_ipoib/scripts/lfsmaster.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# arg: $1 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh ~/ + +#Include the correct ipoib options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + +fi diff --git a/experimental/lustre_ipoib/scripts/lfsoss.sh b/experimental/lustre_ipoib/scripts/lfsoss.sh new file mode 100755 index 000000000..0b9b060a5 --- /dev/null +++ b/experimental/lustre_ipoib/scripts/lfsoss.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh ~/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if [ "$PSSH_NODENUM" != "0" ]; then + + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device +#Include the correct ipoib options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +fi diff --git a/experimental/lustre_ipoib/scripts/lfspkgs.sh b/experimental/lustre_ipoib/scripts/lfspkgs.sh new file mode 100755 index 000000000..3120d3ba6 --- /dev/null +++ b/experimental/lustre_ipoib/scripts/lfspkgs.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +yum -y install lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre-resource-agents e2fsprogs || exit 1 + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf + +systemctl restart waagent + +weak-modules --add-kernel --no-initramfs + +umount /mnt/resource diff --git a/experimental/lustre_ipoib_nvmedrives/config.json b/experimental/lustre_ipoib_nvmedrives/config.json new file mode 100644 index 000000000..ae8fc7902 --- /dev/null +++ b/experimental/lustre_ipoib_nvmedrives/config.json @@ -0,0 +1,202 @@ +{ + "location": "variables.location", + "resource_group": "variables.resource_group", + "install_from": "headnode", + "admin_user": "hpcadmin", + "vnet": { + "name": "hpcvnet", + "address_prefix": "10.2.0.0/20", + "subnets": { + "compute": "10.2.0.0/22", + "storage": "10.2.4.0/24" + } + }, + "variables": { + "location" : "", + "resource_group": "", + "image": "OpenLogic:CentOS-HPC:7.6:latest", + "lustreimage": "OpenLogic:CentOS-HPC:7.6:latest", + "drivenum": 4, + "ossnum": 4, + "low_priority": true, + "storage_account": "", + "storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + "la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.10", + "lustre_mount": "/lustre" + }, + "resources": { + "headnode": { + "type": "vm", + "vm_type": "Standard_HB60rs", + "accelerated_networking": false, + "public_ip": true, + "image": "variables.image", + "subnet": "compute", + "tags": [ + "disable-selinux", + "cndefault", + "localuser", + "pbsserver", + "loginnode", + "nfsserver" + ] + }, + "lustre": { + "type": "vmss", + "vm_type": "Standard_HB120rs_v2", + "instances": "9", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre[0:5]", + "osses[1:5]", + "lfsrepo", + "lfsclient[5:9]", + "localuser", + "pbsclient[5:9]", + "nfsclient", + "disable-selinux", + "lfsloganalytics" + ] + } + }, + "install": [ + { + "script": "disable-selinux.sh", + "tag": "disable-selinux", + "sudo": true + }, + { + "script": "cndefault.sh", + "tag": "cndefault", + "sudo": true + }, + { + "script": "nfsserver.sh", + "tag": "nfsserver", + "sudo": true + }, + { + "script": "nfsclient.sh", + "args": [ + "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. 
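For reference, the relevant fragment of `config.json` is shown below (abridged; the tags list is omitted). Only `vm_type` needs to change to use another size, provided it still offers the local NVMe drives and InfiniBand this configuration relies on:

```json
"lustre": {
    "type": "vmss",
    "vm_type": "Standard_HB120rs_v2",
    "instances": "9",
    "image": "variables.lustreimage",
    "subnet": "storage"
}
```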
+ +The configuration file requires the following variables to be set: + +| Variable | Description | +|-------------------------|----------------------------------------------| +| resource_group | The resource group for the project | +| storage_account | The storage account for HSM | +| storage_key | The storage key for HSM | +| storage_container | The container to use for HSM | +| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | +| la_resourcegroup | The resource group for Log Analytics | +| la_name | The Log Analytics Workspace name | + +> Note: you can remove log anaytics and/or HSM from the config file if not required. + +> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/experimental/lustre_ipoib_nvmedrives/scripts/lfsclient.sh b/experimental/lustre_ipoib_nvmedrives/scripts/lfsclient.sh new file mode 100755 index 000000000..4e30d37fa --- /dev/null +++ b/experimental/lustre_ipoib_nvmedrives/scripts/lfsclient.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} +mkdir ~/.ssh + +cp -r /share/home/hpcuser/.ssh ~/ + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if rpm -q lustre; then + + # if the server packages are installed only the client kmod is needed + # for 2.10 and nothing extra is needed for 2.12 + if [ "$lustre_version" = "2.10" ]; then + + if ! rpm -q kmod-lustre-client; then + yum -y install kmod-lustre-client + fi + + fi + +else + + # install the client RPMs if not already installed + if ! rpm -q lustre-client kmod-lustre-client; then + yum -y install lustre-client kmod-lustre-client + fi + weak-modules --add-kernel $(uname -r) + +fi +#Include the correct infiniband options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/experimental/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh b/experimental/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh new file mode 100755 index 000000000..dce36a159 --- /dev/null +++ b/experimental/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# arg: $1 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh ~/ + +#Include the correct ipoib options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + +fi diff --git a/experimental/lustre_ipoib_nvmedrives/scripts/lfsoss.sh b/experimental/lustre_ipoib_nvmedrives/scripts/lfsoss.sh new file mode 100755 index 000000000..0b9b060a5 --- /dev/null +++ b/experimental/lustre_ipoib_nvmedrives/scripts/lfsoss.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh ~/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if [ "$PSSH_NODENUM" != "0" ]; then + + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device +#Include the correct ipoib options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +fi diff --git a/experimental/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh b/experimental/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh new file mode 100755 index 000000000..3120d3ba6 --- /dev/null +++ b/experimental/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +yum -y install lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre-resource-agents e2fsprogs || exit 1 + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf + +systemctl restart waagent + +weak-modules --add-kernel --no-initramfs + +umount /mnt/resource diff --git a/experimental/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh b/experimental/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh new file mode 100755 index 000000000..753167b8f --- /dev/null +++ b/experimental/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh @@ -0,0 +1,2 @@ +#!/bin/bash +sleep 60 #enough time for node reboot to continue process diff --git a/experimental/lustre_rdma_avs/config.json b/experimental/lustre_rdma_avs/config.json new file mode 100644 index 000000000..e418c67ae --- /dev/null +++ b/experimental/lustre_rdma_avs/config.json @@ -0,0 +1,274 @@ +{ + "location": "variables.location", + "resource_group": "variables.resource_group", + "install_from": "headnode", + "admin_user": "hpcadmin", + "vnet": { + "name": "hpcvnet", + "address_prefix": "10.2.0.0/20", + "subnets": { + "compute": "10.2.0.0/22", + "storage": "10.2.4.0/24" + } + }, + "variables": { + "location": "", + "resource_group": "", + "lustreimage": "OpenLogic:CentOS:7.7:7.7.2020042900", + "hpcimage": "OpenLogic:CentOS:7.7:7.7.2020042900", + "compute_instances": 2, + "lustre_instances": 2, + "low_priority": false, + "storage_account": "", + "storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + "la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.12.4", + "lustre_avset": "{{variables.resource_group}}avset", + "lustre_mount": "/lustre" + }, + "resources": { + "headnode": { + "type": "vm", + "vm_type": "Standard_D8s_v3", + "accelerated_networking": false, + "public_ip": true, + "image": "variables.hpcimage", + "subnet": "compute", + "tags": [ + "disable-selinux", + "cndefault", + "localuser", + "pbsserver", + "loginnode", + "rebootlustre", + "nfsserver", + "allnodes" + ] + }, + "compute": { + "type": "vm", + "vm_type": "Standard_HB120rs_v2", + "instances": "variables.compute_instances", + "availability_set": "variables.lustre_avset", + "low_priority": "variables.low_priority", + "accelerated_networking": false, + "image": "variables.hpcimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lfsrepo", + "lfsclient", + "localuser", + "pbsclient", + "nfsclient", + "disable-selinux", + "allnodes" + ] + }, 
+ "lfsmaster": { + "type": "vm", + "vm_type": "Standard_HB120rs_v2", + "availability_set": "variables.lustre_avset", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre", + "lfsmaster", + "lfsrepo", + "localuser", + "nfsclient", + "disable-selinux", + "lfsloganalytics", + "allnodes" + ] + }, + "lustre": { + "type": "vm", + "vm_type": "Standard_HB120rs_v2", + "instances": "variables.lustre_instances", + "availability_set": "variables.lustre_avset", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lfsrepo", + "localuser", + "nfsclient", + "lustre", + "ossnode", + "disable-selinux", + "lfsloganalytics", + "allnodes" + ] + } + }, + "install": [ + { + "script": "disable-selinux.sh", + "tag": "disable-selinux", + "sudo": true + }, + { + "script": "cndefault.sh", + "tag": "cndefault", + "sudo": true + }, + { + "script": "nfsserver.sh", + "tag": "nfsserver", + "sudo": true + }, + { + "script": "nfsclient.sh", + "args": [ + "$(>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + # configuration file to use + azhpc.config: experimental/lustre_rdma_avs/config.json + # pipeline directory + azhpc.pipeline_dir: experimental/lustre_rdma_avs + # destination of scripts. Default is hpcuser@headnode:/apps + #azhpc.script_remote_dest: 'hpcadmin@headnode:.' + +# Add the variables needed in your configuration file +# Uncomment and set values below, or leave commented and thru pipeline variables + # azhpc.variables.location: westeurope + azhpc.variables.compute_instances: 4 + azhpc.variables.low_priority: false + azhpc.variables.lustre_instances: 4 + azhpc.variables.log_analytics_lfs_name: lfs + azhpc.variables.lustre_mount: /lustre + + +# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +jobs: +- template: ../../ci/templates/jobs/build.yml + parameters: + extra_steps: ../../../experimental/lustre_rdma_avs/test.yml diff --git a/experimental/lustre_rdma_avs/readme.md b/experimental/lustre_rdma_avs/readme.md new file mode 100644 index 000000000..3b55df48c --- /dev/null +++ b/experimental/lustre_rdma_avs/readme.md @@ -0,0 +1,36 @@ +# lustre_rdma_avs + +Visualisation: [config.json](https://azurehpc.azureedge.net/?o=https://raw.githubusercontent.com/Azure/azurehpc/master/examples/lustre_Infiniband/config.json) + +This is a deployment of Lustre using the available infiniband network. This solution has been designed to work with true Remote Direct Memory Access(RDMA) . + +This deployment will only function using the Python based AzureHPC (not the BASH libexec). + +Resources: + +* Head node (headnode) +* Compute nodes (compute) +* Lustre + * Management/Meta-data server (lfsmaster) + * Object storage servers (lustre) + * Hierarchical storage management nodes (lfshsm) + * Lustre client exporting with samba (lfssmb) + +> Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. 
+ +The configuration file requires the following variables to be set: + +| Variable | Description | +|-------------------------|----------------------------------------------| +| location | The locaton of the project | +| resource_group | The resource group for the project | +| storage_account | The storage account for HSM | +| storage_key | The storage key for HSM | +| storage_container | The container to use for HSM | +| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | +| la_resourcegroup | The resource group for Log Analytics | +| la_name | The Log Analytics Workspace name | + +> Note: you can remove log anaytics and/or HSM from the config file if not required. + +> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/experimental/lustre_rdma_avs/scripts/installOFED.sh b/experimental/lustre_rdma_avs/scripts/installOFED.sh new file mode 100755 index 000000000..c267519fc --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/installOFED.sh @@ -0,0 +1,4 @@ +#!/bin/bash +yum -y groupinstall --skip-broken "Infiniband Support" 2>/dev/null +echo "done installing Infiniband" +exit 0 diff --git a/experimental/lustre_rdma_avs/scripts/lfsclient.sh b/experimental/lustre_rdma_avs/scripts/lfsclient.sh new file mode 100755 index 000000000..5fc8f9a62 --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/lfsclient.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} + +cp -r /share/home/hpcuser/.ssh /root/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/experimental/lustre_rdma_avs/scripts/lfsmaster.sh b/experimental/lustre_rdma_avs/scripts/lfsmaster.sh new file mode 100755 index 000000000..d2dcdb02e --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/lfsmaster.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# arg: $1 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh /root/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + + diff --git a/experimental/lustre_rdma_avs/scripts/lfsoss.sh b/experimental/lustre_rdma_avs/scripts/lfsoss.sh new file mode 100755 index 000000000..8f39aac68 --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/lfsoss.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh /root/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + + lnetctl net add --net o2ib --if ib0 #double check + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}@o2ib" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device + + +mkdir /mnt/oss +echo "$device /mnt/oss lustre noatime,nodiratime,nobarrier 0 2" >> /etc/fstab +mount -a diff --git a/experimental/lustre_rdma_avs/scripts/lfsrepo.sh b/experimental/lustre_rdma_avs/scripts/lfsrepo.sh new file mode 100755 index 000000000..00e0d9dd9 --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/lfsrepo.sh @@ -0,0 +1,33 @@ +#!/bin/bash +lustre_version=${1-2.10} + +if [ "$lustre_version" = "2.10" -o "$lustre_version" = "2.12" ]; then + lustre_dir=latest-${lustre_version}-release + else + lustre_dir="lustre-$lustre_version" +fi + +cat << EOF >/etc/yum.repos.d/LustrePack.repo +[lustreserver] +name=lustreserver +baseurl=https://downloads.whamcloud.com/public/lustre/${lustre_dir}/el7/server/ +enabled=1 +gpgcheck=0 + +[e2fs] +name=e2fs +baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ +enabled=1 +gpgcheck=0 + +[lustreclient] +name=lustreclient +baseurl=https://downloads.whamcloud.com/public/lustre/${lustre_dir}/el7/client/ +enabled=1 +gpgcheck=0 +EOF + +#Include the correct rdma options +#cat >/etc/modprobe.d/lustre.conf</dev/null + diff --git a/experimental/lustre_rdma_avs/scripts/lustreinstall2.sh b/experimental/lustre_rdma_avs/scripts/lustreinstall2.sh new file mode 100755 index 000000000..60f3e759e --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/lustreinstall2.sh @@ -0,0 +1,10 @@ +#!/bin/bash +yum -y --nogpgcheck --enablerepo=lustreserver install kmod-lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre lustre-resource-agents +modprobe -v lustre + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf + +weak-modules --add-kernel --no-initramfs +systemctl enable lustre +umount /mnt/resource diff --git a/experimental/lustre_rdma_avs/scripts/lustrenetwork.sh b/experimental/lustre_rdma_avs/scripts/lustrenetwork.sh new file mode 100755 index 000000000..f95d33864 --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/lustrenetwork.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf +service waagent restart +service rdma start +modprobe lnet +lctl network configure +lnetctl net add --net o2ib --if ib0 #need this to come up every time +sleep 5 diff --git a/experimental/lustre_rdma_avs/scripts/rebootlustre.sh b/experimental/lustre_rdma_avs/scripts/rebootlustre.sh new file mode 100755 index 000000000..2d33c180b --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/rebootlustre.sh @@ -0,0 +1,16 @@ +#!/bin/bash +vmlist=$1 +osscount=$2 +totalcount=$((osscount+2)) +index=0 +#prep headnode +cp -r /share/home/hpcuser/.ssh /root/ +echo "vmlist is ${vmlist[@]}" + +#needs to be done sequentially +for vmname in ${vmlist[@]}; do + echo "Rebooting $vmname" + ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null + index=$((index+1)) +done +exit 0 diff --git a/experimental/lustre_rdma_avs/test.yml b/experimental/lustre_rdma_avs/test.yml new file mode 100644 index 
000000000..b5d22d3b3 --- /dev/null +++ b/experimental/lustre_rdma_avs/test.yml @@ -0,0 +1,16 @@ +steps: +- template: ../../ci/templates/steps/azhpc-run.yml + parameters: + user: hpcuser + command: /apps/ci/check_pbs_nodes.sh $(azhpc.variables.compute_instances) + +- template: ../../ci/templates/steps/azhpc-run.yml + parameters: + user: hpcadmin + command: /apps/ci/check_mountpoints.sh $(azhpc.variables.lustre_mount) + +- template: ../../ci/templates/steps/azhpc-run.yml + parameters: + user: hpcadmin + command: /apps/ci/check_lustre_client.sh $(azhpc.variables.lustre_mount) + diff --git a/experimental/lustre_rdma_avs/writeup b/experimental/lustre_rdma_avs/writeup new file mode 100644 index 000000000..93562dd26 --- /dev/null +++ b/experimental/lustre_rdma_avs/writeup @@ -0,0 +1,17 @@ +- lustre-ipoib - This is a created implementation of Lustre using ip over infiniband (IPoIB) +- lustre-rdma - This is a created implementation of Lustre using native Remote Direct Memory Access (RDMA) + +Changes to files to enable Infiniband functionality: +lfsmaster.sh +lfsoss.sh +lfsclient.sh +lfsrepo.sh +lfspkgs.sh + +Addition for the installation of new Mellanox OFED (MOFED) for the Lustre kernel : installMOFED.sh + +Additions for correct Lustre kernel : +lustreinstall1.sh +lustreinstall2.sh + +Addition for pause after MDS/OSS reboot : waitforreboot.sh diff --git a/experimental/lustre_rdma_nvmedrives/config.json b/experimental/lustre_rdma_nvmedrives/config.json new file mode 100644 index 000000000..d3fe8b3ba --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/config.json @@ -0,0 +1,234 @@ +{ + "location": "variables.location", + "resource_group": "variables.resource_group", + "install_from": "headnode", + "admin_user": "hpcadmin", + "vnet": { + "name": "hpcvnet", + "address_prefix": "10.2.0.0/20", + "subnets": { + "compute": "10.2.0.0/22", + "storage": "10.2.4.0/24" + } + }, + "variables": { + "location": "", + "resource_group": "", + "image": "OpenLogic:CentOS:7.6:latest", + "lustreimage": "OpenLogic:CentOS:7.6:latest", + "ossnum": 4, + "low_priority": true, + "storage_account": "", + "storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + "la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.10", + "lustre_mount": "/lustre" + }, + "resources": { + "headnode": { + "type": "vm", + "vm_type": "Standard_HB60rs", + "accelerated_networking": false, + "public_ip": true, + "image": "variables.image", + "subnet": "compute", + "tags": [ + "disable-selinux", + "cndefault", + "rebootlustre", + "localuser", + "pbsserver", + "allnodes", + "loginnode", + "nfsserver" + ] + }, + "lustre": { + "type": "vmss", + "vm_type": "Standard_HB120rs_v2", + "instances": "9", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre[0:5]", + "osses[1:5]", + "lfsrepo", + "lfsclient[5:9]", + "localuser", + "pbsclient[5:9]", + "nfsclient", + "allnodes", + "disable-selinux", + "lfsloganalytics" + ] + } + }, + "install": [ + { + "script": "disable-selinux.sh", + "tag": "disable-selinux", + "sudo": true + }, + { + "script": "cndefault.sh", + "tag": "cndefault", + "sudo": true + }, + { + "script": "nfsserver.sh", + "tag": "nfsserver", + "sudo": true + }, + { + "script": "nfsclient.sh", + "args": [ + "$( Note: The HC 
nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. + +The configuration file requires the following variables to be set: + +| Variable | Description | +|-------------------------|----------------------------------------------| +| location | The location (Azure region) for the project | +| resource_group | The resource group for the project | +| storage_account | The storage account for HSM | +| storage_key | The storage key for HSM | +| storage_container | The container to use for HSM | +| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | +| la_resourcegroup | The resource group for Log Analytics | +| la_name | The Log Analytics Workspace name | + +> Note: you can remove log anaytics and/or HSM from the config file if not required. + +> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/experimental/lustre_rdma_nvmedrives/scripts/installOFED.sh b/experimental/lustre_rdma_nvmedrives/scripts/installOFED.sh new file mode 100755 index 000000000..c267519fc --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/installOFED.sh @@ -0,0 +1,4 @@ +#!/bin/bash +yum -y groupinstall --skip-broken "Infiniband Support" 2>/dev/null +echo "done installing Infiniband" +exit 0 diff --git a/experimental/lustre_rdma_nvmedrives/scripts/lfsclient.sh b/experimental/lustre_rdma_nvmedrives/scripts/lfsclient.sh new file mode 100755 index 000000000..0a3f302fc --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/lfsclient.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} +mkdir ~/.ssh + +cp -r /share/home/hpcuser/.ssh ~/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/experimental/lustre_rdma_nvmedrives/scripts/lfsmaster.sh b/experimental/lustre_rdma_nvmedrives/scripts/lfsmaster.sh new file mode 100755 index 000000000..1869a1f71 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/lfsmaster.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# arg: $1 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh /root/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + +fi + diff --git a/experimental/lustre_rdma_nvmedrives/scripts/lfsoss.sh b/experimental/lustre_rdma_nvmedrives/scripts/lfsoss.sh new file mode 100755 index 000000000..ada2bb8c7 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/lfsoss.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh /root/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if [ "$PSSH_NODENUM" != "0" ]; then + lnetctl net add --net o2ib --if ib0 #double check + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}@o2ib" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device + + +mkdir /mnt/oss +echo "$device /mnt/oss lustre noatime,nodiratime,nobarrier 0 2" >> /etc/fstab +mount -a +fi diff --git a/experimental/lustre_rdma_nvmedrives/scripts/lfsrepo.sh b/experimental/lustre_rdma_nvmedrives/scripts/lfsrepo.sh new file mode 100755 index 000000000..db1eeb165 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/lfsrepo.sh @@ -0,0 +1,27 @@ +#!/bin/bash +lustre_version=${1-2.10} + +cat << EOF >/etc/yum.repos.d/LustrePack.repo +[lustreserver] +name=lustreserver +baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ +enabled=1 +gpgcheck=0 + +[e2fs] +name=e2fs +baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ +enabled=1 +gpgcheck=0 + +[lustreclient] +name=lustreclient +baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ +enabled=1 +gpgcheck=0 +EOF + +#Include the correct rdma options +#cat >/etc/modprobe.d/lustre.conf</dev/null + diff --git a/experimental/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh b/experimental/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh new file mode 100755 index 000000000..60f3e759e --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh @@ -0,0 +1,10 @@ +#!/bin/bash +yum -y --nogpgcheck --enablerepo=lustreserver install kmod-lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre lustre-resource-agents +modprobe -v lustre + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf + +weak-modules --add-kernel --no-initramfs +systemctl enable lustre +umount /mnt/resource diff --git a/experimental/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh b/experimental/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh new file mode 100755 index 000000000..f95d33864 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf +service waagent restart +service rdma start +modprobe lnet +lctl network configure +lnetctl net add --net o2ib --if ib0 #need this to come up every time +sleep 5 diff --git a/experimental/lustre_rdma_nvmedrives/scripts/rebootlustre.sh b/experimental/lustre_rdma_nvmedrives/scripts/rebootlustre.sh new file mode 100755 index 000000000..9d1bf38c7 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/rebootlustre.sh @@ -0,0 +1,19 @@ +#!/bin/bash +vmlist=$1 +ossnum=$2 + +totalcount=$(($ossnum+2)) +index=0 + +#prep headnode +cp -r /share/home/hpcuser/.ssh /root/ + +#needs to be done sequentially +for vmname in ${vmlist[@]}; do + if [ $index -lt $totalcount ] ; then + echo "Rebooting $vmname" + ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null + fi +done +exit 0 # to ensure no errors are thrown + diff --git a/experimental/lustre_rdma_nvmedrives/writeup b/experimental/lustre_rdma_nvmedrives/writeup 
new file mode 100644 index 000000000..809ec71d3 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/writeup @@ -0,0 +1,17 @@ +- lustre-ipoib - an implementation of Lustre using IP over InfiniBand (IPoIB) +- lustre-rdma - an implementation of Lustre using native Remote Direct Memory Access (RDMA) + +Files changed to enable InfiniBand functionality: +lfsmaster.sh +lfsoss.sh +lfsclient.sh +lfsrepo.sh +lfspkgs.sh + +Added to install the new OFED : installOFED.sh + +Added to install the correct Lustre kernel : lustreinstall1.sh +Added to install the Lustre packages : lustreinstall2.sh + +Added to reboot the Lustre MDS/OSS nodes : rebootlustre.sh +Added to pause after the MDS/OSS reboot : waitforreboot.sh
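After the reboot pause, an easy check that a Lustre node came back on the expected kernel and that the Lustre module and LNet NID are usable (illustrative only, not part of the scripts):

  uname -r
  modprobe -v lustre
  lctl list_nids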