diff --git a/.changelog/24658.txt b/.changelog/24658.txt new file mode 100644 index 00000000000..f46da955490 --- /dev/null +++ b/.changelog/24658.txt @@ -0,0 +1,3 @@ +```release-note:bug +networking: check network namespaces on Linux during client restarts and fail the allocation if an existing namespace is invalid +``` diff --git a/client/allocrunner/network_hook.go b/client/allocrunner/network_hook.go index 570943b3795..af72ba7df5a 100644 --- a/client/allocrunner/network_hook.go +++ b/client/allocrunner/network_hook.go @@ -142,11 +142,14 @@ func (h *networkHook) Prerun() error { h.isolationSetter.SetNetworkIsolation(spec) } - if created { - status, err := h.networkConfigurator.Setup(context.TODO(), h.alloc, spec) + if spec != nil { + status, err := h.networkConfigurator.Setup(context.TODO(), h.alloc, spec, created) if err != nil { return fmt.Errorf("failed to configure networking for alloc: %v", err) } + if status == nil { + return nil // netns already existed and was correctly configured + } // If the driver set the sandbox hostname label, then we will use that // to set the HostsConfig.Hostname. Otherwise, identify the sandbox diff --git a/client/allocrunner/networking.go b/client/allocrunner/networking.go index e98af0b4f89..07a98660795 100644 --- a/client/allocrunner/networking.go +++ b/client/allocrunner/networking.go @@ -14,7 +14,7 @@ import ( // NetworkConfigurator sets up and tears down the interfaces, routes, firewall // rules, etc for the configured networking mode of the allocation. type NetworkConfigurator interface { - Setup(context.Context, *structs.Allocation, *drivers.NetworkIsolationSpec) (*structs.AllocNetworkStatus, error) + Setup(context.Context, *structs.Allocation, *drivers.NetworkIsolationSpec, bool) (*structs.AllocNetworkStatus, error) Teardown(context.Context, *structs.Allocation, *drivers.NetworkIsolationSpec) error } @@ -23,7 +23,7 @@ type NetworkConfigurator interface { // require further configuration type hostNetworkConfigurator struct{} -func (h *hostNetworkConfigurator) Setup(context.Context, *structs.Allocation, *drivers.NetworkIsolationSpec) (*structs.AllocNetworkStatus, error) { +func (h *hostNetworkConfigurator) Setup(context.Context, *structs.Allocation, *drivers.NetworkIsolationSpec, bool) (*structs.AllocNetworkStatus, error) { return nil, nil } func (h *hostNetworkConfigurator) Teardown(context.Context, *structs.Allocation, *drivers.NetworkIsolationSpec) error { @@ -41,10 +41,10 @@ type synchronizedNetworkConfigurator struct { nc NetworkConfigurator } -func (s *synchronizedNetworkConfigurator) Setup(ctx context.Context, allocation *structs.Allocation, spec *drivers.NetworkIsolationSpec) (*structs.AllocNetworkStatus, error) { +func (s *synchronizedNetworkConfigurator) Setup(ctx context.Context, allocation *structs.Allocation, spec *drivers.NetworkIsolationSpec, created bool) (*structs.AllocNetworkStatus, error) { networkingGlobalMutex.Lock() defer networkingGlobalMutex.Unlock() - return s.nc.Setup(ctx, allocation, spec) + return s.nc.Setup(ctx, allocation, spec, created) } func (s *synchronizedNetworkConfigurator) Teardown(ctx context.Context, allocation *structs.Allocation, spec *drivers.NetworkIsolationSpec) error { diff --git a/client/allocrunner/networking_bridge_linux.go b/client/allocrunner/networking_bridge_linux.go index 63a11c8f093..1204837b0dd 100644 --- a/client/allocrunner/networking_bridge_linux.go +++ b/client/allocrunner/networking_bridge_linux.go @@ -120,12 +120,12 @@ func (b *bridgeNetworkConfigurator) ensureForwardingRules() error { } // Setup calls the CNI plugins with the add action -func (b *bridgeNetworkConfigurator) Setup(ctx context.Context, alloc *structs.Allocation, spec *drivers.NetworkIsolationSpec) (*structs.AllocNetworkStatus, error) { +func (b *bridgeNetworkConfigurator) Setup(ctx context.Context, alloc *structs.Allocation, spec *drivers.NetworkIsolationSpec, created bool) (*structs.AllocNetworkStatus, error) { if err := b.ensureForwardingRules(); err != nil { return nil, fmt.Errorf("failed to initialize table forwarding rules: %v", err) } - return b.cni.Setup(ctx, alloc, spec) + return b.cni.Setup(ctx, alloc, spec, created) } // Teardown calls the CNI plugins with the delete action diff --git a/client/allocrunner/networking_cni.go b/client/allocrunner/networking_cni.go index 1c877fbe2ff..30d73c0d6f3 100644 --- a/client/allocrunner/networking_cni.go +++ b/client/allocrunner/networking_cni.go @@ -139,7 +139,8 @@ func addNomadWorkloadCNIArgs(logger log.Logger, alloc *structs.Allocation, cniAr } // Setup calls the CNI plugins with the add action -func (c *cniNetworkConfigurator) Setup(ctx context.Context, alloc *structs.Allocation, spec *drivers.NetworkIsolationSpec) (*structs.AllocNetworkStatus, error) { +func (c *cniNetworkConfigurator) Setup(ctx context.Context, alloc *structs.Allocation, spec *drivers.NetworkIsolationSpec, created bool) (*structs.AllocNetworkStatus, error) { + if err := c.ensureCNIInitialized(); err != nil { return nil, fmt.Errorf("cni not initialized: %w", err) } @@ -171,6 +172,16 @@ func (c *cniNetworkConfigurator) Setup(ctx context.Context, alloc *structs.Alloc cniArgs[ConsulIPTablesConfigEnvVar] = string(iptablesCfg) } + if !created { + err := c.cni.Check(ctx, alloc.ID, spec.Path, + c.nsOpts.withCapabilityPortMap(portMaps.ports), + c.nsOpts.withArgs(cniArgs), + ) + // return early: either the netns is ok or we need to start the alloc + // over anyways + return nil, err + } + // Depending on the version of bridge cni plugin used, a known race could occure // where two alloc attempt to create the nomad bridge at the same time, resulting // in one of them to fail. This rety attempts to overcome those erroneous failures.