Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get rid of docker pause containers with a custom runtime #20017

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions command/commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -976,6 +976,9 @@ func Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory {
Meta: meta,
}, nil
},
"runc": func() (cli.Command, error) {
return &RuncCommand{}, nil
},
apollo13 marked this conversation as resolved.
Show resolved Hide resolved
"scaling": func() (cli.Command, error) {
return &ScalingCommand{
Meta: meta,
Expand Down
78 changes: 78 additions & 0 deletions command/runc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package command

import (
"encoding/json"
"fmt"
"io"
"os"
"slices"
"syscall"
)

type RuncCommand struct {
}

func (c *RuncCommand) Help() string {
return ""
}

func (c *RuncCommand) Name() string { return "runc" }

func (c *RuncCommand) Run(args []string) int {
if slices.Contains(args, "create") && slices.Contains(args, "--bundle") {
bundle := args[slices.Index(args, "--bundle")+1]
apollo13 marked this conversation as resolved.
Show resolved Hide resolved

jsonFile, _ := os.Open(fmt.Sprintf("%s/config.json", bundle))
byteValue, _ := io.ReadAll(jsonFile)
jsonFile.Close()

var config map[string]interface{}
apollo13 marked this conversation as resolved.
Show resolved Hide resolved
err := json.Unmarshal([]byte(byteValue), &config)

Check failure on line 33 in command/runc.go

View workflow job for this annotation

GitHub Actions / checks / checks

unnecessary conversion (unconvert)
if err != nil {
return 1
}

linuxConfig := config["linux"].(map[string]interface{})
if config["annotations"] == nil {
goto exec // Nothing to do
}
annotations := config["annotations"].(map[string]interface{})
if annotations["network_ns"] == nil {
goto exec // Nothing to do
}
namespaces := linuxConfig["namespaces"].([]interface{})
// If there is a network namespace, modify it
foundNetworkNamespace := false
for i, v := range namespaces {
extractedValue := v.(map[string]interface{})
if extractedValue["type"] == "network" {
extractedValue["path"] = annotations["network_ns"]
namespaces[i] = extractedValue
foundNetworkNamespace = true
break
}
}
// if not add one
if !foundNetworkNamespace {
namespace := map[string]interface{}{"type": "network", "path": annotations["network_ns"]}
namespaces = append(namespaces, namespace)
}
linuxConfig["namespaces"] = namespaces
config["linux"] = linuxConfig

jsonBytes, _ := json.Marshal(config)
os.WriteFile("/tmp/config.json", jsonBytes, 0600)
os.WriteFile(fmt.Sprintf("%s/config.json", bundle), jsonBytes, 0600)
}
exec:
apollo13 marked this conversation as resolved.
Show resolved Hide resolved
args = append([]string{"runc"}, args...)
syscall.Exec("/usr/bin/runc", args, os.Environ())
return 0
}

func (c *RuncCommand) Synopsis() string {
return ""
}
6 changes: 6 additions & 0 deletions drivers/docker/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,8 @@ var (
// task containers. If true, nomad doesn't start docker_logger/logmon processes
"disable_log_collection": hclspec.NewAttr("disable_log_collection", "bool", false),

"new_networking": hclspec.NewAttr("new_networking", "bool", false),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should bikeshed over this field name some. But "no pause container" with a default of false makes for an awkward double-negative so I appreciate you're trying to avoid that here. 😁

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You read my mind… Also I am not sure if the default (if omitted) wouldn't be always false and then it would be no pause container = false which would be the default we don't want? Or can we express that better in hcl (bool reference to allow checking for null?).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Renamed to nomad_native_networking for the lack of a better name, feel free to change at will.


// windows_allow_insecure_container_admin indicates that on windows,
// docker checks the task.user field or, if unset, the container image
// manifest after pulling the container, to see if it's running as
Expand Down Expand Up @@ -675,6 +677,7 @@ type DriverConfig struct {
infraImagePullTimeoutDuration time.Duration `codec:"-"`
ContainerExistsAttempts uint64 `codec:"container_exists_attempts"`
DisableLogCollection bool `codec:"disable_log_collection"`
NewNetworking bool `codec:"new_networking"`
PullActivityTimeout string `codec:"pull_activity_timeout"`
PidsLimit int64 `codec:"pids_limit"`
pullActivityTimeoutDuration time.Duration `codec:"-"`
Expand Down Expand Up @@ -828,5 +831,8 @@ func (d *Driver) TaskConfigSchema() (*hclspec.Spec, error) {
// features this driver supports.
func (d *Driver) Capabilities() (*drivers.Capabilities, error) {
driverCapabilities.DisableLogCollection = d.config != nil && d.config.DisableLogCollection
if d.config != nil {
driverCapabilities.MustInitiateNetwork = !d.config.NewNetworking
}
return driverCapabilities, nil
}
23 changes: 19 additions & 4 deletions drivers/docker/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -990,6 +990,13 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T
if _, ok := d.config.allowRuntimes[containerRuntime]; !ok && containerRuntime != "" {
return c, fmt.Errorf("requested runtime %q is not allowed", containerRuntime)
}
if d.config.NewNetworking {
if containerRuntime == "" {
containerRuntime = "nomad"
} else {
return c, fmt.Errorf("new-style networking not compatible with custom runtimess")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking about this a bit and I'm not sure that's inevitable? What if the Nomad runtime function took an argument that was the binary to call next (defaulting to runc) along with args to transparently pass-thru? We likely need that anyway because we can't rely on the location of runc being at a specific path like /usr/bin/runc as we've done here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So that is just me being lazy. In practice, the docker api supports querying the registered runtimes (run docker info on your box and you will see them, so it must be part of the API). We could query that and then pass the "sub"-runtime through via annotations like with the network ns (we have to use annotations since it is docker calling the runtime and we cannot pass arbritrary args)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay we cannot pass on the binary to call next via annotations since the oci runtime operations like state & delete only pass in a container ID. So the only thing we can do is force the enduser to specify something like this in docker daemon.json:

{
	"runtimes": {
          "nomad": {"path": "/home/florian/sources/nomad/bin/nomad",
                             "runtimeArgs": ["runc", "/usr/bin/runc"]},
          "nomad-nvidia": {"path": "/home/florian/sources/nomad/bin/nomad",
                             "runtimeArgs": ["runc", "/usr/bin/nvidia-runtime"]},
       }
}

and then we can just prefix nomad- to the requested runtime if the "new" networking is enabled. Not really nice but probably good enough.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a commit that prefixes other runtimes with nomad-*. Should we consider the interaction with allow_runtimes? I am leaning towards a no since those nomad-* runtimes have to be configured explicitly anyways?

}
}

// Validate isolation modes on windows
if runtime.GOOS != "windows" {
Expand Down Expand Up @@ -1038,6 +1045,8 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T

Runtime: containerRuntime,
GroupAdd: driverConfig.GroupAdd,

Annotations: map[string]string{},
}

hostConfig.Resources = containerapi.Resources{
Expand Down Expand Up @@ -1285,10 +1294,16 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T
// shared alloc network
if hostConfig.NetworkMode == "" {
if task.NetworkIsolation != nil && task.NetworkIsolation.Path != "" {
// find the previously created parent container to join networks with
netMode := fmt.Sprintf("container:%s", task.NetworkIsolation.Labels[dockerNetSpecLabelKey])
logger.Debug("configuring network mode for task group", "network_mode", netMode)
hostConfig.NetworkMode = containerapi.NetworkMode(netMode)
if d.config.NewNetworking {
// "host" is not actually true here, it will cause joining the existing namespace
hostConfig.NetworkMode = "host"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might still be better to use "none" here and test if it still works? But the differences are probably marginal. docker network inspect host/none will show the containers either way, even if they are not really attached.

hostConfig.Annotations["network_ns"] = task.NetworkIsolation.Path
} else {
// find the previously created parent container to join networks with
netMode := fmt.Sprintf("container:%s", task.NetworkIsolation.Labels[dockerNetSpecLabelKey])
logger.Debug("configuring network mode for task group", "network_mode", netMode)
hostConfig.NetworkMode = containerapi.NetworkMode(netMode)
}
} else {
// docker default
logger.Debug("networking mode not specified; using default")
Expand Down
Loading