Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

raw_exec: oom_score_adj support #23308

Merged
merged 13 commits into from
Jun 14, 2024
3 changes: 3 additions & 0 deletions .changelog/23308.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
raw_exec: Added support for oom_score_adj
```
9 changes: 9 additions & 0 deletions drivers/rawexec/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ var (
"args": hclspec.NewAttr("args", "list(string)", false),
"cgroup_v2_override": hclspec.NewAttr("cgroup_v2_override", "string", false),
"cgroup_v1_override": hclspec.NewAttr("cgroup_v1_override", "list(map(string))", false),
"oom_score_adj": hclspec.NewAttr("oom_score_adj", "number", false),
})

// capabilities is returned by the Capabilities RPC and indicates what
Expand Down Expand Up @@ -156,6 +157,9 @@ type TaskConfig struct {
//
// * All resource isolation guarantees are lost FOR ALL TASKS if set *
OverrideCgroupV1 hclutils.MapStrStr `codec:"cgroup_v1_override"`

// OOMScoreAdj sets the oom_score_adj on Linux systems
OOMScoreAdj int `codec:"oom_score_adj"`
}

// TaskState is the state which is encoded in the handle returned in
Expand Down Expand Up @@ -324,6 +328,10 @@ func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *drive
return nil, nil, fmt.Errorf("failed to decode driver config: %v", err)
}

if driverConfig.OOMScoreAdj < 0 {
return nil, nil, fmt.Errorf("oom_score_adj must not be negative")
}

d.logger.Info("starting task", "driver_cfg", hclog.Fmt("%+v", driverConfig))
handle := drivers.NewTaskHandle(taskHandleVersion)
handle.Config = cfg
Expand Down Expand Up @@ -353,6 +361,7 @@ func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *drive
Resources: cfg.Resources.Copy(),
OverrideCgroupV2: cgroupslib.CustomPathCG2(driverConfig.OverrideCgroupV2),
OverrideCgroupV1: driverConfig.OverrideCgroupV1,
OOMScoreAdj: int32(driverConfig.OOMScoreAdj),
}

// ensure only one of cgroups_v1_override and cgroups_v2_override have been
Expand Down
4 changes: 4 additions & 0 deletions drivers/shared/executor/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@ type ExecCommand struct {
//
// * All resource isolation guarantees are lost FOR ALL TASKS if set *
OverrideCgroupV1 map[string]string

// OOMScoreAdj allows setting oom_score_adj (likelihood of process being
// OOM killed) on Linux systems
OOMScoreAdj int32
}

func (c *ExecCommand) getCgroupOr(controller, fallback string) string {
Expand Down
10 changes: 4 additions & 6 deletions drivers/shared/executor/executor_universal_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ func (e *UniversalExecutor) statCG(cgroup string) (int, func(), error) {
func (e *UniversalExecutor) configureResourceContainer(command *ExecCommand, pid int) (func(), error) {
cgroup := command.StatsCgroup()

// ensure tasks do not inherit Nomad agent oom_score_adj value
if err := e.setOomAdj(); err != nil {
// ensure tasks get the desired oom_score_adj value set
if err := e.setOomAdj(command.OOMScoreAdj); err != nil {
return nil, err
}

Expand Down Expand Up @@ -280,12 +280,10 @@ func (e *UniversalExecutor) configureCG2(cgroup string, command *ExecCommand) {
_ = ed.Write("cpuset.cpus", cpusetCpus)
}

func (e *UniversalExecutor) setOomAdj() error {
// children should not inherit Nomad agent oom_score_adj value
//
func (e *UniversalExecutor) setOomAdj(oomScore int32) error {
// /proc/self/oom_score_adj should work on both cgroups v1 and v2 systems
// range is -1000 to 1000; 0 is the default
return os.WriteFile("/proc/self/oom_score_adj", []byte("0"), 0644)
return os.WriteFile("/proc/self/oom_score_adj", []byte(strconv.Itoa(int(oomScore))), 0644)
}

func (*UniversalExecutor) computeCPU(command *ExecCommand) uint64 {
Expand Down
28 changes: 28 additions & 0 deletions drivers/shared/executor/executor_universal_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ package executor

import (
"fmt"
"os"
"strconv"
"strings"
"testing"

"github.com/hashicorp/nomad/ci"
Expand Down Expand Up @@ -99,3 +102,28 @@ func TestExecutor_InvalidCgroup(t *testing.T) {
must.ErrorContains(t, err, "unable to configure cgroups: no such file or directory")

}

// TestUniversalExecutor_setOomAdj launches a long-running task with a custom
// OOMScoreAdj and verifies that the value reported for the launched process in
// /proc/<pid>/oom_score_adj matches what was requested.
func TestUniversalExecutor_setOomAdj(t *testing.T) {
	ci.Parallel(t)

	factory := universalFactory
	testExecCmd := testExecutorCommand(t)
	execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir
	execCmd.Cmd = "sleep"
	execCmd.Args = []string{"infinity"}
	execCmd.OOMScoreAdj = 1000

	factory.configureExecCmd(t, execCmd)
	defer allocDir.Destroy()
	executor := factory.new(testlog.HCLogger(t), compute)
	defer executor.Shutdown("", 0)

	p, err := executor.Launch(execCmd)
	must.NoError(t, err)

	oomScore, err := os.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", p.Pid))
	must.NoError(t, err)

	// Fail loudly if the file content is not an integer; ignoring the parse
	// error would turn garbage into 0 and report a misleading value mismatch.
	oomScoreInt, err := strconv.Atoi(strings.TrimSpace(string(oomScore)))
	must.NoError(t, err)
	must.Eq(t, execCmd.OOMScoreAdj, int32(oomScoreInt))
}
1 change: 1 addition & 0 deletions drivers/shared/executor/grpc_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ func (c *grpcExecutorClient) Launch(cmd *ExecCommand) (*ProcessState, error) {
Capabilities: cmd.Capabilities,
CgroupV2Override: cmd.OverrideCgroupV2,
CgroupV1Override: cmd.OverrideCgroupV1,
OomScoreAdj: cmd.OOMScoreAdj,
}
resp, err := c.client.Launch(ctx, req)
if err != nil {
Expand Down
1 change: 1 addition & 0 deletions drivers/shared/executor/grpc_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ func (s *grpcExecutorServer) Launch(ctx context.Context, req *proto.LaunchReques
Capabilities: req.Capabilities,
OverrideCgroupV2: req.CgroupV2Override,
OverrideCgroupV1: req.CgroupV1Override,
OOMScoreAdj: req.OomScoreAdj,
})

if err != nil {
Expand Down
Loading
Loading