From 44e88d52ab4a6f2a98b277967845a6f0544c5fb8 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Mon, 16 Dec 2024 09:36:35 -0500
Subject: [PATCH] dynamic host volumes: account for other claims in capability check

When we feasibility check a dynamic host volume against a volume request, we
check the attachment mode and access mode. This only ensures that the
capabilities match, but doesn't enforce the semantics of the capabilities
against other claims that may be made on the volume.

Add support for checking the requested capability against other allocations
that have claimed the volume.

Requires additional updates:
* Job validation should allow attachment/access mode for host volumes
* Added new single-node access modes and removed multi-node access modes

Ref: https://github.com/hashicorp/nomad/pull/24479
---
 nomad/state/state_store_host_volumes.go |   5 +
 nomad/structs/host_volumes.go           |  61 ++++++++++--
 nomad/structs/host_volumes_test.go      | 124 +++++++++++++++++++++++-
 nomad/structs/volumes.go                |  18 ++--
 scheduler/feasible.go                   |  30 +++---
 scheduler/generic_sched.go              |   1 +
 scheduler/stack.go                      |  13 +--
 7 files changed, 211 insertions(+), 41 deletions(-)

diff --git a/nomad/state/state_store_host_volumes.go b/nomad/state/state_store_host_volumes.go
index 7e55e6ced43..5c7845a4d3d 100644
--- a/nomad/state/state_store_host_volumes.go
+++ b/nomad/state/state_store_host_volumes.go
@@ -30,6 +30,7 @@ func (s *StateStore) HostVolumeByID(ws memdb.WatchSet, ns, id string, withAllocs
 		vol = vol.Copy()
 		vol.Allocations = []*structs.AllocListStub{}
+		vol.Writers = []string{}
 
 		// we can't use AllocsByNodeTerminal because we only want to filter out
 		// allocs that are client-terminal, not server-terminal
@@ -43,6 +44,9 @@
 			}
 			for _, volClaim := range alloc.Job.LookupTaskGroup(alloc.TaskGroup).Volumes {
 				if volClaim.Type == structs.VolumeTypeHost && volClaim.Source == vol.Name {
+					if !volClaim.ReadOnly {
+						vol.Writers = append(vol.Writers, alloc.ID)
+					}
 					vol.Allocations = append(vol.Allocations, alloc.Stub(nil))
 				}
 			}
@@ -101,6 +105,7 @@ func (s *StateStore) UpsertHostVolume(index uint64, vol *structs.HostVolume) err
 	// Allocations are denormalized on read, so we don't want these to be
 	// written to the state store.
 	vol.Allocations = nil
+	vol.Writers = nil
 	vol.ModifyIndex = index
 
 	err = txn.Insert(TableHostVolumes, vol)
diff --git a/nomad/structs/host_volumes.go b/nomad/structs/host_volumes.go
index c254bf72902..a7a996d6b67 100644
--- a/nomad/structs/host_volumes.go
+++ b/nomad/structs/host_volumes.go
@@ -84,6 +84,11 @@ type HostVolume struct {
 	// this host volume. They are denormalized on read and this field will be
 	// never written to Raft
 	Allocations []*AllocListStub `json:",omitempty"`
+
+	// Writers is a list of alloc IDs from the Allocations field that can
+	// write to this volume. This list is denormalized on read, never written
+	// to Raft, and never returned by the API.
+	Writers []string `json:"-"`
 }
 
 type HostVolumeState string
@@ -245,6 +250,47 @@ func (hv *HostVolume) GetID() string {
 	return hv.ID
 }
 
+// IsAvailable determines if the volume is available for a request
+func (hv *HostVolume) IsAvailable(reqAccess HostVolumeAccessMode, reqAttach HostVolumeAttachmentMode, readOnly bool, previousAllocID string) bool {
+
+	if hv.State != HostVolumeStateReady {
+		return false
+	}
+
+	// check that the volume has the requested capability at all
+	var capOk bool
+	for _, cap := range hv.RequestedCapabilities {
+		if reqAccess == cap.AccessMode &&
+			reqAttach == cap.AttachmentMode {
+			capOk = true
+			break
+		}
+	}
+	if !capOk {
+		return false
+	}
+
+	// if no other allocations claim the volume, it's first-come-first-served to
+	// determine how subsequent allocs can claim it
+	if len(hv.Allocations) == 0 {
+		return true
+	}
+
+	switch reqAccess {
+	case HostVolumeAccessModeUnknown:
+		return false // invalid mode
+	case HostVolumeAccessModeSingleNodeReader:
+		return readOnly
+	case HostVolumeAccessModeSingleNodeWriter, HostVolumeAccessModeSingleNodeSingleWriter:
+		return len(hv.Writers) == 0 ||
+			(len(hv.Writers) == 1 && hv.Writers[0] == previousAllocID)
+	case HostVolumeAccessModeSingleNodeMultiWriter:
+		// no constraint
+	}
+
+	return true
+}
+
 // HostVolumeCapability is the requested attachment and access mode for a volume
 type HostVolumeCapability struct {
 	AttachmentMode HostVolumeAttachmentMode
@@ -275,9 +321,8 @@ func (hvc *HostVolumeCapability) Validate() error {
 	switch hvc.AccessMode {
 	case HostVolumeAccessModeSingleNodeReader,
 		HostVolumeAccessModeSingleNodeWriter,
-		HostVolumeAccessModeMultiNodeReader,
-		HostVolumeAccessModeMultiNodeSingleWriter,
-		HostVolumeAccessModeMultiNodeMultiWriter:
+		HostVolumeAccessModeSingleNodeSingleWriter,
+		HostVolumeAccessModeSingleNodeMultiWriter:
 	default:
 		return fmt.Errorf("invalid access mode: %q", hvc.AccessMode)
 	}
@@ -302,12 +347,10 @@ type HostVolumeAccessMode string
 
 const (
 	HostVolumeAccessModeUnknown HostVolumeAccessMode = ""
 
-	HostVolumeAccessModeSingleNodeReader HostVolumeAccessMode = "single-node-reader-only"
-	HostVolumeAccessModeSingleNodeWriter HostVolumeAccessMode = "single-node-writer"
-
-	HostVolumeAccessModeMultiNodeReader HostVolumeAccessMode = "multi-node-reader-only"
-	HostVolumeAccessModeMultiNodeSingleWriter HostVolumeAccessMode = "multi-node-single-writer"
-	HostVolumeAccessModeMultiNodeMultiWriter HostVolumeAccessMode = "multi-node-multi-writer"
+	HostVolumeAccessModeSingleNodeReader       HostVolumeAccessMode = "single-node-reader-only"
+	HostVolumeAccessModeSingleNodeWriter       HostVolumeAccessMode = "single-node-writer"
+	HostVolumeAccessModeSingleNodeSingleWriter HostVolumeAccessMode = "single-node-single-writer"
+	HostVolumeAccessModeSingleNodeMultiWriter  HostVolumeAccessMode = "single-node-multi-writer"
 )
 
 // HostVolumeStub is used for responses for the list volumes endpoint
diff --git a/nomad/structs/host_volumes_test.go b/nomad/structs/host_volumes_test.go
index 499bc27d1c8..85f64a6b95a 100644
--- a/nomad/structs/host_volumes_test.go
+++ b/nomad/structs/host_volumes_test.go
@@ -8,6 +8,7 @@ import (
 	"time"
 
 	"github.com/hashicorp/nomad/ci"
+	"github.com/hashicorp/nomad/helper"
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/shoenig/test/must"
 )
@@ -45,7 +46,7 @@ func TestHostVolume_Copy(t *testing.T) {
 	out.Constraints[0].LTarget = "${meta.node_class}"
 	out.RequestedCapabilities = append(out.RequestedCapabilities, &HostVolumeCapability{
 		AttachmentMode: HostVolumeAttachmentModeBlockDevice,
-		AccessMode:
HostVolumeAccessModeMultiNodeReader, + AccessMode: HostVolumeAccessModeSingleNodeMultiWriter, }) out.Parameters["foo"] = "baz" @@ -195,7 +196,7 @@ func TestHostVolume_CanonicalizeForUpdate(t *testing.T) { RequestedCapacityMaxBytes: 500000, RequestedCapabilities: []*HostVolumeCapability{{ AttachmentMode: HostVolumeAttachmentModeFilesystem, - AccessMode: HostVolumeAccessModeMultiNodeMultiWriter, + AccessMode: HostVolumeAccessModeSingleNodeMultiWriter, }}, } existing := &HostVolume{ @@ -240,7 +241,7 @@ func TestHostVolume_CanonicalizeForUpdate(t *testing.T) { must.Eq(t, []*HostVolumeCapability{{ AttachmentMode: HostVolumeAttachmentModeFilesystem, - AccessMode: HostVolumeAccessModeMultiNodeMultiWriter, + AccessMode: HostVolumeAccessModeSingleNodeMultiWriter, }}, vol.RequestedCapabilities) must.Eq(t, "/var/nomad/alloc_mounts/82f357d6.ext4", vol.HostPath) @@ -251,3 +252,120 @@ func TestHostVolume_CanonicalizeForUpdate(t *testing.T) { must.Nil(t, vol.Allocations) } + +func TestHostVolume_IsAvailable(t *testing.T) { + + allCaps := []*HostVolumeCapability{} + + for _, accessMode := range []HostVolumeAccessMode{ + HostVolumeAccessModeSingleNodeReader, + HostVolumeAccessModeSingleNodeWriter, + HostVolumeAccessModeSingleNodeSingleWriter, + HostVolumeAccessModeSingleNodeMultiWriter, + } { + for _, attachMode := range []HostVolumeAttachmentMode{ + HostVolumeAttachmentModeFilesystem, HostVolumeAttachmentModeBlockDevice, + } { + allCaps = append(allCaps, &HostVolumeCapability{ + AttachmentMode: attachMode, + AccessMode: accessMode, + }) + } + } + + testCases := []struct { + name string + hasAllocs []string + hasWriters []string + hasCaps []*HostVolumeCapability + req *HostVolumeCapability + readOnly bool + prevAllocID string + expect bool + }{ + { + name: "enforce attachment mode", + hasCaps: []*HostVolumeCapability{{ + AttachmentMode: HostVolumeAttachmentModeBlockDevice, + AccessMode: HostVolumeAccessModeSingleNodeSingleWriter, + }}, + req: &HostVolumeCapability{ + AttachmentMode: HostVolumeAttachmentModeFilesystem, + AccessMode: HostVolumeAccessModeSingleNodeSingleWriter, + }, + expect: false, + }, + { + name: "enforce read only", + hasAllocs: []string{"a", "b", "c"}, + req: &HostVolumeCapability{ + AttachmentMode: HostVolumeAttachmentModeFilesystem, + AccessMode: HostVolumeAccessModeSingleNodeReader, + }, + expect: false, + }, + { + name: "enforce read only ok", + hasAllocs: []string{"a", "b", "c"}, + req: &HostVolumeCapability{ + AttachmentMode: HostVolumeAttachmentModeFilesystem, + AccessMode: HostVolumeAccessModeSingleNodeReader, + }, + readOnly: true, + expect: true, + }, + { + name: "enforce single writer", + hasAllocs: []string{"a", "b", "c"}, + hasWriters: []string{"b"}, + req: &HostVolumeCapability{ + AttachmentMode: HostVolumeAttachmentModeFilesystem, + AccessMode: HostVolumeAccessModeSingleNodeSingleWriter, + }, + expect: false, + }, + { + name: "enforce single writer ok across deployments", + hasAllocs: []string{"a", "b", "c"}, + hasWriters: []string{"b"}, + req: &HostVolumeCapability{ + AttachmentMode: HostVolumeAttachmentModeFilesystem, + AccessMode: HostVolumeAccessModeSingleNodeSingleWriter, + }, + prevAllocID: "b", + expect: true, + }, + { + name: "multi writer is always ok", + hasAllocs: []string{"a", "b", "c"}, + hasWriters: []string{"b", "c"}, + req: &HostVolumeCapability{ + AttachmentMode: HostVolumeAttachmentModeFilesystem, + AccessMode: HostVolumeAccessModeSingleNodeMultiWriter, + }, + expect: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { 
+ allocs := helper.ConvertSlice(tc.hasAllocs, func(id string) *AllocListStub { + return &AllocListStub{ID: id} + }) + vol := &HostVolume{ + Allocations: allocs, + Writers: tc.hasWriters, + State: HostVolumeStateReady, + } + if len(tc.hasCaps) > 0 { + vol.RequestedCapabilities = tc.hasCaps + } else { + vol.RequestedCapabilities = allCaps + } + + must.Eq(t, tc.expect, vol.IsAvailable( + tc.req.AccessMode, tc.req.AttachmentMode, tc.readOnly, tc.prevAllocID)) + }) + } + +} diff --git a/nomad/structs/volumes.go b/nomad/structs/volumes.go index b8c95fc2862..6f508ef3f27 100644 --- a/nomad/structs/volumes.go +++ b/nomad/structs/volumes.go @@ -170,16 +170,22 @@ func (v *VolumeRequest) Validate(jobType string, taskGroupCount, canaries int) e switch v.Type { case VolumeTypeHost: - if v.AttachmentMode != CSIVolumeAttachmentModeUnknown { - addErr("host volumes cannot have an attachment mode") - } - if v.AccessMode != CSIVolumeAccessModeUnknown { - addErr("host volumes cannot have an access mode") - } if v.MountOptions != nil { + // TODO(1.10.0): support mount options for dynamic host volumes addErr("host volumes cannot have mount options") } + switch v.AccessMode { + case CSIVolumeAccessModeSingleNodeReader, CSIVolumeAccessModeMultiNodeReader: + if !v.ReadOnly { + addErr("%s volumes must be read-only", v.AccessMode) + } + default: + // dynamic host volumes are all "per node" so there's no way to + // validate that other access modes work for a given volume until we + // have access to other allocations (in the scheduler) + } + case VolumeTypeCSI: switch v.AttachmentMode { diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 60442f92e7f..553f41df0aa 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -137,9 +137,10 @@ func NewRandomIterator(ctx Context, nodes []*structs.Node) *StaticIterator { // HostVolumeChecker is a FeasibilityChecker which returns whether a node has // the host volumes necessary to schedule a task group. type HostVolumeChecker struct { - ctx Context - volumeReqs []*structs.VolumeRequest - namespace string + ctx Context + volumeReqs []*structs.VolumeRequest + namespace string + previousAllocationID string } // NewHostVolumeChecker creates a HostVolumeChecker from a set of volumes @@ -151,9 +152,10 @@ func NewHostVolumeChecker(ctx Context) *HostVolumeChecker { } // SetVolumes takes the volumes required by a task group and updates the checker. 
-func (h *HostVolumeChecker) SetVolumes(allocName string, ns string, volumes map[string]*structs.VolumeRequest) { +func (h *HostVolumeChecker) SetVolumes(allocName string, ns string, volumes map[string]*structs.VolumeRequest, prevAllocID string) { h.namespace = ns h.volumeReqs = []*structs.VolumeRequest{} + h.previousAllocationID = prevAllocID for _, req := range volumes { if req.Type != structs.VolumeTypeHost { continue // filter CSI volumes @@ -194,7 +196,7 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { } if volCfg.ID != "" { // dynamic host volume - vol, err := h.ctx.State().HostVolumeByID(nil, h.namespace, volCfg.ID, false) + vol, err := h.ctx.State().HostVolumeByID(nil, h.namespace, volCfg.ID, true) if err != nil || vol == nil { // node fingerprint has a dynamic volume that's no longer in the // state store; this is only possible if the batched fingerprint @@ -202,18 +204,12 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { // raft entry completes return false } - if vol.State != structs.HostVolumeStateReady { - return false - } - var capOk bool - for _, cap := range vol.RequestedCapabilities { - if req.AccessMode == structs.CSIVolumeAccessMode(cap.AccessMode) && - req.AttachmentMode == structs.CSIVolumeAttachmentMode(cap.AttachmentMode) { - capOk = true - break - } - } - if !capOk { + if !vol.IsAvailable( + structs.HostVolumeAccessMode(req.AccessMode), + structs.HostVolumeAttachmentMode(req.AttachmentMode), + req.ReadOnly, + h.previousAllocationID, + ) { return false } } else if !req.ReadOnly { diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index f9fd669e592..525320bcceb 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -838,6 +838,7 @@ func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs } } selectOptions.PenaltyNodeIDs = penaltyNodes + selectOptions.PreviousAllocID = prevAllocation.ID } if preferredNode != nil { selectOptions.PreferredNodes = []*structs.Node{preferredNode} diff --git a/scheduler/stack.go b/scheduler/stack.go index 1f2b6586886..22b11e4e6c1 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -35,10 +35,11 @@ type Stack interface { } type SelectOptions struct { - PenaltyNodeIDs map[string]struct{} - PreferredNodes []*structs.Node - Preempt bool - AllocName string + PenaltyNodeIDs map[string]struct{} + PreferredNodes []*structs.Node + Preempt bool + AllocName string + PreviousAllocID string } // GenericStack is the Stack used for the Generic scheduler. 
It is @@ -156,7 +157,7 @@ func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ra s.taskGroupDrivers.SetDrivers(tgConstr.drivers) s.taskGroupConstraint.SetConstraints(tgConstr.constraints) s.taskGroupDevices.SetTaskGroup(tg) - s.taskGroupHostVolumes.SetVolumes(options.AllocName, s.jobNamespace, tg.Volumes) + s.taskGroupHostVolumes.SetVolumes(options.AllocName, s.jobNamespace, tg.Volumes, options.PreviousAllocID) s.taskGroupCSIVolumes.SetVolumes(options.AllocName, tg.Volumes) if len(tg.Networks) > 0 { s.taskGroupNetwork.SetNetwork(tg.Networks[0]) @@ -349,7 +350,7 @@ func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ran s.taskGroupDrivers.SetDrivers(tgConstr.drivers) s.taskGroupConstraint.SetConstraints(tgConstr.constraints) s.taskGroupDevices.SetTaskGroup(tg) - s.taskGroupHostVolumes.SetVolumes(options.AllocName, s.jobNamespace, tg.Volumes) + s.taskGroupHostVolumes.SetVolumes(options.AllocName, s.jobNamespace, tg.Volumes, options.PreviousAllocID) s.taskGroupCSIVolumes.SetVolumes(options.AllocName, tg.Volumes) if len(tg.Networks) > 0 { s.taskGroupNetwork.SetNetwork(tg.Networks[0])
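
Reviewer note: a minimal, standalone sketch of how the new IsAvailable check is
expected to behave for the single-writer access mode, using only the types and
constants introduced in this patch. The package main wrapper, the volume
contents, and the alloc IDs are made up for illustration.

package main

import (
	"fmt"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	// A ready volume that advertises only the single-writer capability and
	// already has one writer claim ("alloc-1").
	vol := &structs.HostVolume{
		State: structs.HostVolumeStateReady,
		RequestedCapabilities: []*structs.HostVolumeCapability{{
			AttachmentMode: structs.HostVolumeAttachmentModeFilesystem,
			AccessMode:     structs.HostVolumeAccessModeSingleNodeSingleWriter,
		}},
		Allocations: []*structs.AllocListStub{{ID: "alloc-1"}},
		Writers:     []string{"alloc-1"},
	}

	// A second writer is rejected while "alloc-1" holds the claim...
	fmt.Println(vol.IsAvailable(
		structs.HostVolumeAccessModeSingleNodeSingleWriter,
		structs.HostVolumeAttachmentModeFilesystem,
		false, "alloc-2")) // false

	// ...but a replacement for the existing writer is allowed, which is why
	// SelectOptions.PreviousAllocID is threaded down to the HostVolumeChecker.
	fmt.Println(vol.IsAvailable(
		structs.HostVolumeAccessModeSingleNodeSingleWriter,
		structs.HostVolumeAttachmentModeFilesystem,
		false, "alloc-1")) // true
}

The second call mirrors the "enforce single writer ok across deployments" test
case above: a replacement allocation may take over the claim held by the
allocation it replaces.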