Skip to content
This repository has been archived by the owner on Mar 30, 2023. It is now read-only.

Commit

Permalink
Add option to override MGS host (#132)
Browse files Browse the repository at this point in the history
Added the config:
DAC_MGS_HOST

By default it is localhost, which is existing behaviour where
MGS always lives on the primary brick host.

We found operational issues around deciding when to clean out the MGS
after creating lots of filesystems; eventually the MGS will fill up.
As a workaround, we allow operators to specify a custom location
(host and device) for the MGS. This means you can easily rotate
a single MGS (via a config change followed by a reboot of dacd),
which ensures the correct thing happens.
  • Loading branch information
JohnGarbutt authored Jun 2, 2020
1 parent 6ba4e76 commit 3d96be8
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 8 deletions.
2 changes: 2 additions & 0 deletions internal/pkg/config/filesystem.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package config

type FilesystemConfig struct {
MGSDevice string
MGSHost string
MaxMDTs uint
HostGroup string
AnsibleDir string
Expand All @@ -14,6 +15,7 @@ func GetFilesystemConfig() FilesystemConfig {
env := DefaultEnv
conf := FilesystemConfig{
MGSDevice: getString(env, "DAC_MGS_DEV", "sdb"),
MGSHost: getString(env, "DAC_MGS_HOST", "localhost"),
MaxMDTs: getUint(env, "DAC_MAX_MDT_COUNT", 24),
HostGroup: getString(env, "DAC_HOST_GROUP", "dac-prod"),
AnsibleDir: getString(env, "DAC_ANSIBLE_DIR", "/var/lib/data-acc/fs-ansible/"),
Expand Down
23 changes: 18 additions & 5 deletions internal/pkg/filesystem_impl/ansible.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ func (*ansibleImpl) CreateEnvironment(session datamodel.Session) (string, error)
return setupAnsible(Lustre, session.FilesystemStatus.InternalName, session.AllocatedBricks)
}

var conf = config.GetFilesystemConfig()

func getFSInfo(fsType FSType, fsUuid string, allBricks []datamodel.Brick) FSInfo {
func getFSInfo(fsType FSType, fsUuid string, allBricks []datamodel.Brick, conf config.FilesystemConfig) FSInfo {
// give all bricks an index, using the random ordering of allBricks
var allAllocations []datamodel.BrickAllocation
for i, brick := range allBricks {
Expand Down Expand Up @@ -101,7 +99,7 @@ func getFSInfo(fsType FSType, fsUuid string, allBricks []datamodel.Brick) FSInfo
hostInfo := HostInfo{hostName: string(host), OSTS: osts, MDTS: mdts}

isPrimaryBrick := allocations[0].AllocatedIndex == 0
if isPrimaryBrick {
if isPrimaryBrick && conf.MGSHost == "localhost" {
if fsType == Lustre {
hostInfo.MGS = conf.MGSDevice
} else {
Expand All @@ -111,6 +109,18 @@ func getFSInfo(fsType FSType, fsUuid string, allBricks []datamodel.Brick) FSInfo
}
hosts[string(host)] = hostInfo
}

// Add MGSHost override, which may or may not be an existing host
if conf.MGSHost != "localhost" && fsType == Lustre {
hostInfo, ok := hosts[conf.MGSHost]
if !ok {
hostInfo = HostInfo{hostName: conf.MGSHost}
}
hostInfo.MGS = conf.MGSDevice
hosts[conf.MGSHost] = hostInfo
mgsnode = hostInfo.MGS
}

// TODO: add attachments?
fsinfo := FSInfo{
Vars: map[string]string{
Expand All @@ -126,7 +136,8 @@ func getFSInfo(fsType FSType, fsUuid string, allBricks []datamodel.Brick) FSInfo
}

func getInventory(fsType FSType, fsUuid string, allBricks []datamodel.Brick) string {
fsinfo := getFSInfo(fsType, fsUuid, allBricks)
conf := config.GetFilesystemConfig()
fsinfo := getFSInfo(fsType, fsUuid, allBricks, conf)
fsname := fmt.Sprintf("%s", fsUuid)
data := Wrapper{Dacs: FileSystems{Children: map[string]FSInfo{fsname: fsinfo}}}

Expand Down Expand Up @@ -155,6 +166,7 @@ func getPlaybook(fsType FSType, fsUuid string) string {
}

// getAnsibleDir joins suffix onto the configured ansible base
// directory (DAC_ANSIBLE_DIR) and returns the resulting path.
func getAnsibleDir(suffix string) string {
	return path.Join(config.GetFilesystemConfig().AnsibleDir, suffix)
}

Expand Down Expand Up @@ -251,6 +263,7 @@ func executeAnsiblePlaybook(dir string, args string) error {
cmdStr := fmt.Sprintf(`cd %s; . .venv/bin/activate; ansible-playbook %s;`, dir, args)
log.Println("Requested ansible:", cmdStr)

conf := config.GetFilesystemConfig()
if conf.SkipAnsible {
log.Println("Skip as DAC_SKIP_ANSIBLE=True")
time.Sleep(time.Millisecond * 200)
Expand Down
27 changes: 24 additions & 3 deletions internal/pkg/filesystem_impl/ansible_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package filesystem_impl

import (
"fmt"
"github.com/RSE-Cambridge/data-acc/internal/pkg/config"
"github.com/RSE-Cambridge/data-acc/internal/pkg/datamodel"
"github.com/stretchr/testify/assert"
"testing"
Expand Down Expand Up @@ -185,18 +186,38 @@ func TestPlugin_GetFSInfo_MaxMDT_lessHosts(t *testing.T) {
}

fsUuid := "abcdefgh"
result := getFSInfo(Lustre, fsUuid, brickAllocations)
conf := config.GetFilesystemConfig()
conf.MGSHost = "dac5"
conf.MGSDevice = "loop0"
result := getFSInfo(Lustre, fsUuid, brickAllocations, conf)
resultStr := fmt.Sprintf("%+v", result.Hosts)
expected := `map[` +
`dac1:{hostName:dac1 MGS:sdb MDTS:map[nvme1n1:0 nvme2n1:1 nvme3n1:2 nvme4n1:3] ` +
`dac1:{hostName:dac1 MGS: MDTS:map[nvme1n1:0 nvme2n1:1 nvme3n1:2 nvme4n1:3] ` +
`OSTS:map[nvme11n1:30 nvme1n1:0 nvme2n1:1 nvme3n1:2 nvme4n1:3 nvme5n1:4 nvme6n1:5]} ` +
`dac2:{hostName:dac2 MGS: MDTS:map[nvme1n1:4 nvme2n1:5 nvme3n1:6 nvme4n1:7] ` +
`OSTS:map[nvme11n1:31 nvme1n1:6 nvme2n1:7 nvme3n1:8 nvme4n1:9 nvme5n1:10 nvme6n1:11]} ` +
`dac3:{hostName:dac3 MGS: MDTS:map[nvme1n1:8 nvme2n1:9 nvme3n1:10 nvme4n1:11] ` +
`OSTS:map[nvme1n1:12 nvme2n1:13 nvme3n1:14 nvme4n1:15 nvme5n1:16 nvme6n1:17]} ` +
`dac4:{hostName:dac4 MGS: MDTS:map[nvme1n1:12 nvme2n1:13 nvme3n1:14 nvme4n1:15] ` +
`OSTS:map[nvme1n1:18 nvme2n1:19 nvme3n1:20 nvme4n1:21 nvme5n1:22 nvme6n1:23]} ` +
`dac5:{hostName:dac5 MGS: MDTS:map[nvme1n1:16 nvme2n1:17 nvme3n1:18 nvme4n1:19] ` +
`dac5:{hostName:dac5 MGS:loop0 MDTS:map[nvme1n1:16 nvme2n1:17 nvme3n1:18 nvme4n1:19] ` +
`OSTS:map[nvme1n1:24 nvme2n1:25 nvme3n1:26 nvme4n1:27 nvme5n1:28 nvme6n1:29]}]`
assert.Equal(t, expected, resultStr)

conf.MGSHost = "slurmmaster1"
result2 := getFSInfo(Lustre, fsUuid, brickAllocations, conf)
resultStr2 := fmt.Sprintf("%+v", result2.Hosts)
expected2 := `map[` +
`dac1:{hostName:dac1 MGS: MDTS:map[nvme1n1:0 nvme2n1:1 nvme3n1:2 nvme4n1:3] ` +
`OSTS:map[nvme11n1:30 nvme1n1:0 nvme2n1:1 nvme3n1:2 nvme4n1:3 nvme5n1:4 nvme6n1:5]} ` +
`dac2:{hostName:dac2 MGS: MDTS:map[nvme1n1:4 nvme2n1:5 nvme3n1:6 nvme4n1:7] ` +
`OSTS:map[nvme11n1:31 nvme1n1:6 nvme2n1:7 nvme3n1:8 nvme4n1:9 nvme5n1:10 nvme6n1:11]} ` +
`dac3:{hostName:dac3 MGS: MDTS:map[nvme1n1:8 nvme2n1:9 nvme3n1:10 nvme4n1:11] ` +
`OSTS:map[nvme1n1:12 nvme2n1:13 nvme3n1:14 nvme4n1:15 nvme5n1:16 nvme6n1:17]} ` +
`dac4:{hostName:dac4 MGS: MDTS:map[nvme1n1:12 nvme2n1:13 nvme3n1:14 nvme4n1:15] ` +
`OSTS:map[nvme1n1:18 nvme2n1:19 nvme3n1:20 nvme4n1:21 nvme5n1:22 nvme6n1:23]} ` +
`dac5:{hostName:dac5 MGS: MDTS:map[nvme1n1:16 nvme2n1:17 nvme3n1:18 nvme4n1:19] ` +
`OSTS:map[nvme1n1:24 nvme2n1:25 nvme3n1:26 nvme4n1:27 nvme5n1:28 nvme6n1:29]} `+
`slurmmaster1:{hostName:slurmmaster1 MGS:loop0 MDTS:map[] OSTS:map[]}]`
assert.Equal(t, expected2, resultStr2)
}
4 changes: 4 additions & 0 deletions internal/pkg/filesystem_impl/mount.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package filesystem_impl

import (
"fmt"
"github.com/RSE-Cambridge/data-acc/internal/pkg/config"
"github.com/RSE-Cambridge/data-acc/internal/pkg/datamodel"
"log"
"os/exec"
Expand Down Expand Up @@ -34,6 +35,7 @@ func mount(fsType FSType, sessionName datamodel.SessionName, isMultiJob bool, in
// TODO: Move Lustre mount here that is done below
//executeAnsibleMount(fsType, volume, brickAllocations)
}
conf := config.GetFilesystemConfig()
var mountDir = getMountDir(sessionName, isMultiJob, attachment.SessionName)

for _, attachHost := range attachment.Hosts {
Expand Down Expand Up @@ -224,6 +226,7 @@ func mountRemoteFilesystem(fsType FSType, hostname string, lnetSuffix string, mg
func mountLustre(hostname string, lnetSuffix string, mgtHost string, fsname string, directory string) error {
// We assume modprobe -v lustre is already done
// First check if we are mounted already
conf := config.GetFilesystemConfig()
if err := runner.Execute(hostname, true, fmt.Sprintf("grep %s /etc/mtab", directory)); err != nil || conf.SkipAnsible {
if err := runner.Execute(hostname, true, fmt.Sprintf(
"mount -t lustre -o flock,nodev,nosuid %s%s:/%s %s",
Expand Down Expand Up @@ -258,6 +261,7 @@ type run struct {
func (*run) Execute(hostname string, asRoot bool, cmdStr string) error {
log.Println("SSH to:", hostname, "with command:", cmdStr)

conf := config.GetFilesystemConfig()
if conf.SkipAnsible {
log.Println("Skip as DAC_SKIP_ANSIBLE=True")
time.Sleep(time.Millisecond * 200)
Expand Down

0 comments on commit 3d96be8

Please sign in to comment.