diff --git a/nomad/state/state_store_host_volumes.go b/nomad/state/state_store_host_volumes.go
index 7e55e6ced4..5c7845a4d3 100644
--- a/nomad/state/state_store_host_volumes.go
+++ b/nomad/state/state_store_host_volumes.go
@@ -30,6 +30,7 @@ func (s *StateStore) HostVolumeByID(ws memdb.WatchSet, ns, id string, withAllocs
 
         vol = vol.Copy()
         vol.Allocations = []*structs.AllocListStub{}
+        vol.Writers = []string{}
 
         // we can't use AllocsByNodeTerminal because we only want to filter out
         // allocs that are client-terminal, not server-terminal
@@ -43,6 +44,9 @@
             }
             for _, volClaim := range alloc.Job.LookupTaskGroup(alloc.TaskGroup).Volumes {
                 if volClaim.Type == structs.VolumeTypeHost && volClaim.Source == vol.Name {
+                    if !volClaim.ReadOnly {
+                        vol.Writers = append(vol.Writers, alloc.ID)
+                    }
                     vol.Allocations = append(vol.Allocations, alloc.Stub(nil))
                 }
             }
@@ -101,6 +105,7 @@ func (s *StateStore) UpsertHostVolume(index uint64, vol *structs.HostVolume) err
     // Allocations are denormalized on read, so we don't want these to be
     // written to the state store.
     vol.Allocations = nil
+    vol.Writers = nil
     vol.ModifyIndex = index
 
     err = txn.Insert(TableHostVolumes, vol)
diff --git a/nomad/structs/host_volumes.go b/nomad/structs/host_volumes.go
index c254bf7290..a7a996d6b6 100644
--- a/nomad/structs/host_volumes.go
+++ b/nomad/structs/host_volumes.go
@@ -84,6 +84,11 @@ type HostVolume struct {
     // this host volume. They are denormalized on read and this field will be
     // never written to Raft
     Allocations []*AllocListStub `json:",omitempty"`
+
+    // Writers is a list of alloc IDs from the Allocations field that can
+    // write to this volume. This list is denormalized on read, never written
+    // to Raft, and never returned by the API.
+    Writers []string `json:"-"`
 }
 
 type HostVolumeState string
 
@@ -245,6 +250,47 @@ func (hv *HostVolume) GetID() string {
     return hv.ID
 }
 
+// IsAvailable determines if the volume is available for a request
+func (hv *HostVolume) IsAvailable(reqAccess HostVolumeAccessMode, reqAttach HostVolumeAttachmentMode, readOnly bool, previousAllocID string) bool {
+
+    if hv.State != HostVolumeStateReady {
+        return false
+    }
+
+    // check that the volume has the requested capability at all
+    var capOk bool
+    for _, cap := range hv.RequestedCapabilities {
+        if reqAccess == cap.AccessMode &&
+            reqAttach == cap.AttachmentMode {
+            capOk = true
+            break
+        }
+    }
+    if !capOk {
+        return false
+    }
+
+    // if no other allocations claim the volume, it's first-come-first-serve to
+    // determine how subsequent allocs can claim it
+    if len(hv.Allocations) == 0 {
+        return true
+    }
+
+    switch reqAccess {
+    case HostVolumeAccessModeUnknown:
+        return false // invalid mode
+    case HostVolumeAccessModeSingleNodeReader:
+        return readOnly
+    case HostVolumeAccessModeSingleNodeWriter, HostVolumeAccessModeSingleNodeSingleWriter:
+        return len(hv.Writers) == 0 ||
+            (len(hv.Writers) == 1 && hv.Writers[0] == previousAllocID)
+    case HostVolumeAccessModeSingleNodeMultiWriter:
+        // no constraint
+    }
+
+    return true
+}
+
 // HostVolumeCapability is the requested attachment and access mode for a volume
 type HostVolumeCapability struct {
     AttachmentMode HostVolumeAttachmentMode
@@ -275,9 +321,8 @@ func (hvc *HostVolumeCapability) Validate() error {
     switch hvc.AccessMode {
     case HostVolumeAccessModeSingleNodeReader,
         HostVolumeAccessModeSingleNodeWriter,
-        HostVolumeAccessModeMultiNodeReader,
-        HostVolumeAccessModeMultiNodeSingleWriter,
-        HostVolumeAccessModeMultiNodeMultiWriter:
+        HostVolumeAccessModeSingleNodeSingleWriter,
+        HostVolumeAccessModeSingleNodeMultiWriter:
     default:
         return fmt.Errorf("invalid access mode: %q", hvc.AccessMode)
     }
@@ -302,12 +347,10 @@ type HostVolumeAccessMode string
 
 const (
     HostVolumeAccessModeUnknown HostVolumeAccessMode = ""
-    HostVolumeAccessModeSingleNodeReader HostVolumeAccessMode = "single-node-reader-only"
-    HostVolumeAccessModeSingleNodeWriter HostVolumeAccessMode = "single-node-writer"
-
-    HostVolumeAccessModeMultiNodeReader       HostVolumeAccessMode = "multi-node-reader-only"
-    HostVolumeAccessModeMultiNodeSingleWriter HostVolumeAccessMode = "multi-node-single-writer"
-    HostVolumeAccessModeMultiNodeMultiWriter  HostVolumeAccessMode = "multi-node-multi-writer"
+    HostVolumeAccessModeSingleNodeReader       HostVolumeAccessMode = "single-node-reader-only"
+    HostVolumeAccessModeSingleNodeWriter       HostVolumeAccessMode = "single-node-writer"
+    HostVolumeAccessModeSingleNodeSingleWriter HostVolumeAccessMode = "single-node-single-writer"
+    HostVolumeAccessModeSingleNodeMultiWriter  HostVolumeAccessMode = "single-node-multi-writer"
 )
 
 // HostVolumeStub is used for responses for the list volumes endpoint
diff --git a/nomad/structs/host_volumes_test.go b/nomad/structs/host_volumes_test.go
index 499bc27d1c..85f64a6b95 100644
--- a/nomad/structs/host_volumes_test.go
+++ b/nomad/structs/host_volumes_test.go
@@ -8,6 +8,7 @@ import (
     "time"
 
     "github.com/hashicorp/nomad/ci"
+    "github.com/hashicorp/nomad/helper"
     "github.com/hashicorp/nomad/helper/uuid"
     "github.com/shoenig/test/must"
 )
@@ -45,7 +46,7 @@ func TestHostVolume_Copy(t *testing.T) {
     out.Constraints[0].LTarget = "${meta.node_class}"
     out.RequestedCapabilities = append(out.RequestedCapabilities, &HostVolumeCapability{
         AttachmentMode: HostVolumeAttachmentModeBlockDevice,
-        AccessMode:     HostVolumeAccessModeMultiNodeReader,
+        AccessMode:     HostVolumeAccessModeSingleNodeMultiWriter,
     })
     out.Parameters["foo"] = "baz"
 
@@ -195,7 +196,7 @@ func TestHostVolume_CanonicalizeForUpdate(t *testing.T) {
         RequestedCapacityMaxBytes: 500000,
         RequestedCapabilities: []*HostVolumeCapability{{
             AttachmentMode: HostVolumeAttachmentModeFilesystem,
-            AccessMode:     HostVolumeAccessModeMultiNodeMultiWriter,
+            AccessMode:     HostVolumeAccessModeSingleNodeMultiWriter,
         }},
     }
     existing := &HostVolume{
@@ -240,7 +241,7 @@ func TestHostVolume_CanonicalizeForUpdate(t *testing.T) {
 
     must.Eq(t, []*HostVolumeCapability{{
         AttachmentMode: HostVolumeAttachmentModeFilesystem,
-        AccessMode:     HostVolumeAccessModeMultiNodeMultiWriter,
+        AccessMode:     HostVolumeAccessModeSingleNodeMultiWriter,
     }}, vol.RequestedCapabilities)
 
     must.Eq(t, "/var/nomad/alloc_mounts/82f357d6.ext4", vol.HostPath)
@@ -251,3 +252,120 @@ func TestHostVolume_CanonicalizeForUpdate(t *testing.T) {
 
     must.Nil(t, vol.Allocations)
 }
+
+func TestHostVolume_IsAvailable(t *testing.T) {
+
+    allCaps := []*HostVolumeCapability{}
+
+    for _, accessMode := range []HostVolumeAccessMode{
+        HostVolumeAccessModeSingleNodeReader,
+        HostVolumeAccessModeSingleNodeWriter,
+        HostVolumeAccessModeSingleNodeSingleWriter,
+        HostVolumeAccessModeSingleNodeMultiWriter,
+    } {
+        for _, attachMode := range []HostVolumeAttachmentMode{
+            HostVolumeAttachmentModeFilesystem, HostVolumeAttachmentModeBlockDevice,
+        } {
+            allCaps = append(allCaps, &HostVolumeCapability{
+                AttachmentMode: attachMode,
+                AccessMode:     accessMode,
+            })
+        }
+    }
+
+    testCases := []struct {
+        name        string
+        hasAllocs   []string
+        hasWriters  []string
+        hasCaps     []*HostVolumeCapability
+        req         *HostVolumeCapability
+        readOnly    bool
+        prevAllocID string
+        expect      bool
+    }{
+        {
+            name: "enforce attachment mode",
+            hasCaps: []*HostVolumeCapability{{
+                AttachmentMode: HostVolumeAttachmentModeBlockDevice,
+                AccessMode:     HostVolumeAccessModeSingleNodeSingleWriter,
+            }},
+            req: &HostVolumeCapability{
+                AttachmentMode: HostVolumeAttachmentModeFilesystem,
+                AccessMode:     HostVolumeAccessModeSingleNodeSingleWriter,
+            },
+            expect: false,
+        },
+        {
+            name:      "enforce read only",
+            hasAllocs: []string{"a", "b", "c"},
+            req: &HostVolumeCapability{
+                AttachmentMode: HostVolumeAttachmentModeFilesystem,
+                AccessMode:     HostVolumeAccessModeSingleNodeReader,
+            },
+            expect: false,
+        },
+        {
+            name:      "enforce read only ok",
+            hasAllocs: []string{"a", "b", "c"},
+            req: &HostVolumeCapability{
+                AttachmentMode: HostVolumeAttachmentModeFilesystem,
+                AccessMode:     HostVolumeAccessModeSingleNodeReader,
+            },
+            readOnly: true,
+            expect:   true,
+        },
+        {
+            name:       "enforce single writer",
+            hasAllocs:  []string{"a", "b", "c"},
+            hasWriters: []string{"b"},
+            req: &HostVolumeCapability{
+                AttachmentMode: HostVolumeAttachmentModeFilesystem,
+                AccessMode:     HostVolumeAccessModeSingleNodeSingleWriter,
+            },
+            expect: false,
+        },
+        {
+            name:       "enforce single writer ok across deployments",
+            hasAllocs:  []string{"a", "b", "c"},
+            hasWriters: []string{"b"},
+            req: &HostVolumeCapability{
+                AttachmentMode: HostVolumeAttachmentModeFilesystem,
+                AccessMode:     HostVolumeAccessModeSingleNodeSingleWriter,
+            },
+            prevAllocID: "b",
+            expect:      true,
+        },
+        {
+            name:       "multi writer is always ok",
+            hasAllocs:  []string{"a", "b", "c"},
+            hasWriters: []string{"b", "c"},
+            req: &HostVolumeCapability{
+                AttachmentMode: HostVolumeAttachmentModeFilesystem,
+                AccessMode:     HostVolumeAccessModeSingleNodeMultiWriter,
+            },
+            expect: true,
+        },
+    }
+
+    for _, tc := range testCases {
+        t.Run(tc.name, func(t *testing.T) {
+            allocs := helper.ConvertSlice(tc.hasAllocs, func(id string) *AllocListStub {
+                return &AllocListStub{ID: id}
+            })
+            vol := &HostVolume{
+                Allocations: allocs,
+                Writers:     tc.hasWriters,
+                State:       HostVolumeStateReady,
+            }
+            if len(tc.hasCaps) > 0 {
+                vol.RequestedCapabilities = tc.hasCaps
+            } else {
+                vol.RequestedCapabilities = allCaps
+            }
+
+            must.Eq(t, tc.expect, vol.IsAvailable(
+                tc.req.AccessMode, tc.req.AttachmentMode, tc.readOnly, tc.prevAllocID))
+        })
+    }
+
+}
diff --git a/nomad/structs/volumes.go b/nomad/structs/volumes.go
index b8c95fc286..6f508ef3f2 100644
--- a/nomad/structs/volumes.go
+++ b/nomad/structs/volumes.go
@@ -170,16 +170,22 @@ func (v *VolumeRequest) Validate(jobType string, taskGroupCount, canaries int) e
 
     switch v.Type {
     case VolumeTypeHost:
-        if v.AttachmentMode != CSIVolumeAttachmentModeUnknown {
-            addErr("host volumes cannot have an attachment mode")
-        }
-        if v.AccessMode != CSIVolumeAccessModeUnknown {
-            addErr("host volumes cannot have an access mode")
-        }
         if v.MountOptions != nil {
+            // TODO(1.10.0): support mount options for dynamic host volumes
             addErr("host volumes cannot have mount options")
         }
 
+        switch v.AccessMode {
+        case CSIVolumeAccessModeSingleNodeReader, CSIVolumeAccessModeMultiNodeReader:
+            if !v.ReadOnly {
+                addErr("%s volumes must be read-only", v.AccessMode)
+            }
+        default:
+            // dynamic host volumes are all "per node" so there's no way to
+            // validate that other access modes work for a given volume until we
+            // have access to other allocations (in the scheduler)
+        }
+
     case VolumeTypeCSI:
 
         switch v.AttachmentMode {
diff --git a/scheduler/feasible.go b/scheduler/feasible.go
index 60442f92e7..553f41df0a 100644
--- a/scheduler/feasible.go
+++ b/scheduler/feasible.go
@@ -137,9 +137,10 @@ func NewRandomIterator(ctx Context, nodes []*structs.Node) *StaticIterator {
 // HostVolumeChecker is a FeasibilityChecker which returns whether a node has
 // the host volumes necessary to schedule a task group.
 type HostVolumeChecker struct {
-    ctx        Context
-    volumeReqs []*structs.VolumeRequest
-    namespace  string
+    ctx                  Context
+    volumeReqs           []*structs.VolumeRequest
+    namespace            string
+    previousAllocationID string
 }
 
 // NewHostVolumeChecker creates a HostVolumeChecker from a set of volumes
@@ -151,9 +152,10 @@ func NewHostVolumeChecker(ctx Context) *HostVolumeChecker {
 }
 
 // SetVolumes takes the volumes required by a task group and updates the checker.
-func (h *HostVolumeChecker) SetVolumes(allocName string, ns string, volumes map[string]*structs.VolumeRequest) {
+func (h *HostVolumeChecker) SetVolumes(allocName string, ns string, volumes map[string]*structs.VolumeRequest, prevAllocID string) {
     h.namespace = ns
     h.volumeReqs = []*structs.VolumeRequest{}
+    h.previousAllocationID = prevAllocID
     for _, req := range volumes {
         if req.Type != structs.VolumeTypeHost {
             continue // filter CSI volumes
@@ -194,7 +196,7 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool {
         }
 
         if volCfg.ID != "" { // dynamic host volume
-            vol, err := h.ctx.State().HostVolumeByID(nil, h.namespace, volCfg.ID, false)
+            vol, err := h.ctx.State().HostVolumeByID(nil, h.namespace, volCfg.ID, true)
             if err != nil || vol == nil {
                 // node fingerprint has a dynamic volume that's no longer in the
                 // state store; this is only possible if the batched fingerprint
@@ -202,18 +204,12 @@
                 // update from a delete comes in before the
                 // raft entry completes
                 return false
             }
-            if vol.State != structs.HostVolumeStateReady {
-                return false
-            }
-            var capOk bool
-            for _, cap := range vol.RequestedCapabilities {
-                if req.AccessMode == structs.CSIVolumeAccessMode(cap.AccessMode) &&
-                    req.AttachmentMode == structs.CSIVolumeAttachmentMode(cap.AttachmentMode) {
-                    capOk = true
-                    break
-                }
-            }
-            if !capOk {
+            if !vol.IsAvailable(
+                structs.HostVolumeAccessMode(req.AccessMode),
+                structs.HostVolumeAttachmentMode(req.AttachmentMode),
+                req.ReadOnly,
+                h.previousAllocationID,
+            ) {
                 return false
             }
         } else if !req.ReadOnly {
diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go
index f9fd669e59..525320bcce 100644
--- a/scheduler/generic_sched.go
+++ b/scheduler/generic_sched.go
@@ -838,6 +838,7 @@ func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs
             }
         }
         selectOptions.PenaltyNodeIDs = penaltyNodes
+        selectOptions.PreviousAllocID = prevAllocation.ID
     }
     if preferredNode != nil {
         selectOptions.PreferredNodes = []*structs.Node{preferredNode}
diff --git a/scheduler/stack.go b/scheduler/stack.go
index 1f2b658688..22b11e4e6c 100644
--- a/scheduler/stack.go
+++ b/scheduler/stack.go
@@ -35,10 +35,11 @@ type Stack interface {
 }
 
 type SelectOptions struct {
-    PenaltyNodeIDs map[string]struct{}
-    PreferredNodes []*structs.Node
-    Preempt        bool
-    AllocName      string
+    PenaltyNodeIDs  map[string]struct{}
+    PreferredNodes  []*structs.Node
+    Preempt         bool
+    AllocName       string
+    PreviousAllocID string
 }
 
 // GenericStack is the Stack used for the Generic scheduler. It is
@@ -156,7 +157,7 @@ func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ra
     s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
     s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
     s.taskGroupDevices.SetTaskGroup(tg)
-    s.taskGroupHostVolumes.SetVolumes(options.AllocName, s.jobNamespace, tg.Volumes)
+    s.taskGroupHostVolumes.SetVolumes(options.AllocName, s.jobNamespace, tg.Volumes, options.PreviousAllocID)
     s.taskGroupCSIVolumes.SetVolumes(options.AllocName, tg.Volumes)
     if len(tg.Networks) > 0 {
         s.taskGroupNetwork.SetNetwork(tg.Networks[0])
@@ -349,7 +350,7 @@ func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ran
     s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
     s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
     s.taskGroupDevices.SetTaskGroup(tg)
-    s.taskGroupHostVolumes.SetVolumes(options.AllocName, s.jobNamespace, tg.Volumes)
+    s.taskGroupHostVolumes.SetVolumes(options.AllocName, s.jobNamespace, tg.Volumes, options.PreviousAllocID)
     s.taskGroupCSIVolumes.SetVolumes(options.AllocName, tg.Volumes)
     if len(tg.Networks) > 0 {
         s.taskGroupNetwork.SetNetwork(tg.Networks[0])
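
With the new single-node access modes, HostVolume.IsAvailable becomes the one gate for claims against a dynamic host volume: the requested capability must match first, then the writer-count rules apply against the Writers list that HostVolumeByID now denormalizes, with the previous alloc ID letting a replacement keep an existing single-writer claim. Below is a minimal sketch (not part of the patch) of that behavior; the volume values and alloc IDs are invented for illustration, only the types, constants, and method come from this change.

package main

import (
    "fmt"

    "github.com/hashicorp/nomad/nomad/structs"
)

func main() {
    // A ready volume that advertises filesystem attachment with the
    // single-writer access mode and is already claimed by one writer.
    vol := &structs.HostVolume{
        State: structs.HostVolumeStateReady,
        RequestedCapabilities: []*structs.HostVolumeCapability{{
            AttachmentMode: structs.HostVolumeAttachmentModeFilesystem,
            AccessMode:     structs.HostVolumeAccessModeSingleNodeSingleWriter,
        }},
        Allocations: []*structs.AllocListStub{{ID: "alloc-1"}},
        Writers:     []string{"alloc-1"},
    }

    // A second, unrelated writer is rejected because the single writer
    // slot is taken.
    fmt.Println(vol.IsAvailable(
        structs.HostVolumeAccessModeSingleNodeSingleWriter,
        structs.HostVolumeAttachmentModeFilesystem,
        false, "")) // false

    // A writer that replaces the current one (same previous alloc ID) is
    // allowed, which is what keeps the claim across a deployment update.
    fmt.Println(vol.IsAvailable(
        structs.HostVolumeAccessModeSingleNodeSingleWriter,
        structs.HostVolumeAttachmentModeFilesystem,
        false, "alloc-1")) // true
}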