diff --git a/init/cli/lib.go b/init/cli/lib.go index 01536d0f..acdcbd78 100644 --- a/init/cli/lib.go +++ b/init/cli/lib.go @@ -110,6 +110,7 @@ func getConfiguredCommands(ctx cli.Context, loggers launchlib.ServiceLoggers) (m if err != nil { return nil, errors.Wrap(err, "failed to compile commands from static and custom configurations") } + cmds := make(map[string]CommandContext) cmds[staticConfig.ServiceName] = CommandContext{ serviceCmds.Primary, diff --git a/integration_test/go_java_launcher_integration_test.go b/integration_test/go_java_launcher_integration_test.go index b7b64820..c893f9a2 100644 --- a/integration_test/go_java_launcher_integration_test.go +++ b/integration_test/go_java_launcher_integration_test.go @@ -15,10 +15,12 @@ package integration_test import ( + "bytes" "log" "os" "os/exec" "path/filepath" + "regexp" "strconv" "strings" "syscall" @@ -74,33 +76,39 @@ func TestCreatesDirs(t *testing.T) { } func TestSubProcessesStoppedWhenMainDies(t *testing.T) { - cmd := mainWithArgs(t, "testdata/launcher-static-multiprocess.yml", "testdata/launcher-custom-multiprocess-long-sleep.yml") - require.NoError(t, cmd.Start()) - - // give launcher time to spawn sub-processes - time.Sleep(200 * time.Millisecond) - ppid := cmd.Process.Pid - children := childProcesses(t, ppid) - assert.Len(t, children, 2, "there should be one sub-process and one monitor") + cmd := mainWithArgs(t, "testdata/launcher-static-multiprocess.yml", "testdata/launcher-custom-multiprocess-long-sub-process.yml") + children := runMultiProcess(t, cmd) assert.NoError(t, cmd.Wait()) - time.Sleep(launchlib.CheckPeriod + 500*time.Millisecond) for _, pid := range children { - // This always succeeds on unix systems as it merely creates a process object - process, err := os.FindProcess(pid) - if err != nil { - continue - } + assert.False(t, launchlib.IsPidAlive(pid), "child was not killed") + } +} + +func TestSubProcessesParsedMonitorSignals(t *testing.T) { + cmd := mainWithArgs(t, "testdata/launcher-static-multiprocess.yml", "testdata/launcher-custom-multiprocess-long-sub-process.yml") + + output := &bytes.Buffer{} + cmd.Stdout = output - // Sending a signal of 0 checks the process exists, without actually sending a signal, - // see https://linux.die.net/man/2/kill - err = process.Signal(syscall.Signal(0)) - if err != nil { - continue + children := runMultiProcess(t, cmd) + var monitor int + for cmdLine, pid := range children { + if strings.Contains(cmdLine, "--group-monitor") { + monitor = pid + break } - assert.Fail(t, "child process was not killed", pid) } + + assert.NotZero(t, monitor, "no monitor pid found") + require.NoError(t, launchlib.SignalPid(monitor, syscall.SIGPOLL)) + + assert.NoError(t, cmd.Wait()) + + trapped, err := regexp.Compile("Caught SIGPOLL") + require.NoError(t, err) + assert.Len(t, trapped.FindAll(output.Bytes(), -1), 2, "expect two messages that SIGPOLL was caught") } func runMainWithArgs(t *testing.T, staticConfigFile, customConfigFile string) (string, error) { @@ -115,8 +123,14 @@ func mainWithArgs(t *testing.T, staticConfigFile, customConfigFile string) *exec return exec.Command(cli, staticConfigFile, customConfigFile) } -func childProcesses(t *testing.T, pid int) map[string]int { - command := exec.Command("/bin/ps", "-o", "pid,command", "--no-headers", "--ppid", strconv.Itoa(pid)) +func runMultiProcess(t *testing.T, cmd *exec.Cmd) map[string]int { + require.NoError(t, cmd.Start()) + + // let the launcher create the sub-processes + time.Sleep(500 * time.Millisecond) + ppid := cmd.Process.Pid + + command := exec.Command("/bin/ps", "-o", "pid,command", "--no-headers", "--ppid", strconv.Itoa(ppid)) output, err := command.CombinedOutput() require.NoError(t, err) @@ -132,6 +146,7 @@ func childProcesses(t *testing.T, pid int) map[string]int { } } + assert.Len(t, children, 2, "there should be one sub-process and one monitor") return children } diff --git a/integration_test/jdk/bin/java b/integration_test/jdk/bin/java index 52730680..259422d0 100755 --- a/integration_test/jdk/bin/java +++ b/integration_test/jdk/bin/java @@ -1,10 +1,15 @@ #!/bin/sh +trap 'echo "Caught SIGPOLL"' 29 + echo echo "main method" -if [ -z $SLEEP_TIME ]; then - sleep 5 -else - sleep $SLEEP_TIME -fi +# sleep stops execution of the current process including echoing that a signal was received +# breaking up the sleep gives time between sleeps for the trap to happen +SLEEP_TIME=${SLEEP_TIME:-5} +COUNTER=0 +while [ $COUNTER -lt $SLEEP_TIME ]; do + sleep 1 > /dev/null 2>&1 + COUNTER=$((COUNTER+1)) +done diff --git a/integration_test/testdata/launcher-custom-multiprocess-long-sub-process.yml b/integration_test/testdata/launcher-custom-multiprocess-long-sub-process.yml new file mode 100644 index 00000000..6e9ad3f0 --- /dev/null +++ b/integration_test/testdata/launcher-custom-multiprocess-long-sub-process.yml @@ -0,0 +1,11 @@ +configType: java +configVersion: 1 +jvmOpts: + - '-Xmx1g' +subProcesses: + sidecar: + configType: java + env: + SLEEP_TIME: "200" + jvmOpts: + - '-Xmx1g' diff --git a/launcher/main/main.go b/launcher/main/main.go index a92c84ef..faf2a3cc 100644 --- a/launcher/main/main.go +++ b/launcher/main/main.go @@ -35,25 +35,31 @@ func Exit1WithMessage(message string) { os.Exit(1) } -func ParseMonitorArgs(args []string) (*launchlib.ProcessMonitor, error) { +func CreateMonitorFromArgs(primaryPID string, subPIDs []string) (*launchlib.ProcessMonitor, error) { monitor := &launchlib.ProcessMonitor{} var err error - if monitor.PrimaryPID, err = strconv.Atoi(args[0]); err != nil { + if monitor.PrimaryPID, err = strconv.Atoi(primaryPID); err != nil { return nil, errors.Wrapf(err, "error parsing service pid") } - if monitor.ProcessGroupPID, err = strconv.Atoi(args[1]); err != nil { - return nil, errors.Wrapf(err, "error parsing service group id") + + for _, pidStr := range subPIDs { + pid, err := strconv.Atoi(pidStr) + if err != nil { + return nil, errors.Wrapf(err, "error parsing sub-process pid") + } + monitor.SubProcessPIDs = append(monitor.SubProcessPIDs, pid) } return monitor, nil } func GenerateMonitorArgs(monitor *launchlib.ProcessMonitor) []string { - return []string{ - monitorFlag, - strconv.Itoa(monitor.PrimaryPID), - strconv.Itoa(monitor.ProcessGroupPID), + args := make([]string, 0, len(monitor.SubProcessPIDs)+2) + args = append(args, monitorFlag, strconv.Itoa(monitor.PrimaryPID)) + for _, pid := range monitor.SubProcessPIDs { + args = append(args, strconv.Itoa(pid)) } + return args } func main() { @@ -62,23 +68,27 @@ func main() { stdout := os.Stdout switch numArgs := len(os.Args); { - case numArgs == 4 && os.Args[1] == monitorFlag: - if monitor, err := ParseMonitorArgs(os.Args[2:]); err != nil { + case numArgs > 3 && os.Args[1] == monitorFlag: + monitor, err := CreateMonitorFromArgs(os.Args[2], os.Args[3:]) + + if err != nil { fmt.Println("error parsing monitor args", err) - Exit1WithMessage(fmt.Sprintf("Usage: go-java-launcher %s ", monitorFlag)) - } else if err = monitor.TermProcessGroupOnDeath(); err != nil { + Exit1WithMessage(fmt.Sprintf("Usage: go-java-launcher %s ", monitorFlag)) + } + + if err = monitor.Run(); err != nil { fmt.Println("error running process monitor", err) Exit1WithMessage("process monitor failed") } return - case numArgs > 3: - Exit1WithMessage("Usage: go-java-launcher " + - "[]") case numArgs == 2: staticConfigFile = os.Args[1] case numArgs == 3: staticConfigFile = os.Args[1] customConfigFile = os.Args[2] + default: + Exit1WithMessage("Usage: go-java-launcher " + + "[]") } // Read configuration @@ -109,51 +119,43 @@ func main() { } if len(cmds.SubProcesses) != 0 { - // For this process (referenced as 0), set the process group id to our pid (also referenced as 0), to - // ensure we are in our own group. - if err := syscall.Setpgid(0, 0); err != nil { - fmt.Printf("Unable to create process group for primary with subProcesses") - panic(err) - } - - pgid := syscall.Getpgrp() monitor := &launchlib.ProcessMonitor{ - PrimaryPID: os.Getpid(), - ProcessGroupPID: pgid, - } - monitorCmd := exec.Command(os.Args[0], GenerateMonitorArgs(monitor)...) - monitorCmd.Stdout = os.Stdout - monitorCmd.Stderr = os.Stderr - - // From this point, if the launcher, or subsequent primary process dies, the process group will be - // terminated by the process monitor - fmt.Println("Starting process monitor for service process group ", pgid) - if err := monitorCmd.Start(); err != nil { - fmt.Println("Failed to start process monitor for service process group") - panic(err) + PrimaryPID: os.Getpid(), + SubProcessPIDs: nil, } + // From this point, any errors in the launcher will cause all of the created sub-processes to also die, + // once the main process is exec'ed, this defer will no longer apply, and the external monitor assumes + // responsibility for killing the sub-processes. + defer func() { + if err := monitor.KillSubProcesses(); err != nil { + // Defer only called if failure complete exec of the primary process, so already panicking + fmt.Println("error cleaning up sub-processes", err) + } + }() for name, subProcess := range cmds.SubProcesses { - // Create struct if not present, as to not override previously set SysProcAttr properties - if subProcess.SysProcAttr == nil { - subProcess.SysProcAttr = &syscall.SysProcAttr{} - } - // Do not set the pgid of the subProcesses, leaving them in the same process group as this one - subProcess.SysProcAttr.Setpgid = false subProcess.Stdout = os.Stdout subProcess.Stderr = os.Stderr fmt.Println("Starting subProcesses ", name, subProcess.Path) if execErr := subProcess.Start(); execErr != nil { if os.IsNotExist(execErr) { - fmt.Printf("Executable not found for subProcess %s at: %s\n", name, - subProcess.Path) + fmt.Printf("Executable not found for subProcess %s at: %s\n", name, subProcess.Path) } panic(execErr) - } else { - fmt.Printf("Started subProcess %s under process pid %d\n", name, - subProcess.Process.Pid) } + monitor.SubProcessPIDs = append(monitor.SubProcessPIDs, subProcess.Process.Pid) + fmt.Printf("Started subProcess %s under process pid %d\n", name, subProcess.Process.Pid) + } + + monitorCmd := exec.Command(os.Args[0], GenerateMonitorArgs(monitor)...) + monitorCmd.Stdout = os.Stdout + monitorCmd.Stderr = os.Stderr + + fmt.Println("Starting process monitor for service process ", monitor.PrimaryPID) + if err := monitorCmd.Start(); err != nil { + fmt.Println("Failed to start process monitor for service process") + panic(err) } } diff --git a/launchlib/monitor.go b/launchlib/monitor.go index 0fc2106b..73d3aba7 100755 --- a/launchlib/monitor.go +++ b/launchlib/monitor.go @@ -15,7 +15,9 @@ package launchlib import ( + "fmt" "os" + "os/signal" "syscall" "time" @@ -27,21 +29,42 @@ const ( ) type ProcessMonitor struct { - PrimaryPID int - ProcessGroupPID int + PrimaryPID int + SubProcessPIDs []int } -func (m *ProcessMonitor) TermProcessGroupOnDeath() error { +func (m *ProcessMonitor) Run() error { if err := m.verify(); err != nil { return err } + m.ForwardSignals() + return m.TermProcessGroupOnDeath() +} + +func (m *ProcessMonitor) ForwardSignals() { + signals := make(chan os.Signal, 35) + signal.Notify(signals) + + go func() { + for { + select { + case sign := <-signals: + // Errors are already printed and there is no where else relevant to return them to. + _ = SignalPid(m.PrimaryPID, sign) + _ = m.SignalSubProcesses(sign) + } + } + }() +} + +func (m *ProcessMonitor) TermProcessGroupOnDeath() error { tick := time.NewTicker(CheckPeriod) alive := true for { select { case <-tick.C: - alive = m.isAlive() + alive = IsPidAlive(m.PrimaryPID) } if !alive { tick.Stop() @@ -49,39 +72,68 @@ func (m *ProcessMonitor) TermProcessGroupOnDeath() error { } } - // Service process has died, terminating process group - if err := syscall.Kill(-m.ProcessGroupPID, syscall.SIGTERM); err != nil { - return errors.Wrapf(err, "unable to send term signal to process group, beware of orphaned subProcesses") + return m.KillSubProcesses() +} + +func (m *ProcessMonitor) KillSubProcesses() error { + return m.SignalSubProcesses(syscall.SIGTERM) +} + +func (m *ProcessMonitor) SignalSubProcesses(sign os.Signal) error { + // Service process has died, terminating sub-processes + var errPids []int + for _, pid := range m.SubProcessPIDs { + if err := SignalPid(pid, sign); err != nil { + errPids = append(errPids, pid) + } + } + + if len(errPids) > 0 { + return errors.Errorf("unable to kill sub-processes for pids %s", errPids) } return nil } func (m *ProcessMonitor) verify() error { - if syscall.Getpgrp() != m.ProcessGroupPID { - return errors.Errorf("ProcessMonitor is part of process group '%s' not service process group '%s'. "+ - "ProcessMonitor is expected to only be used by the go-java-launcher itself, under the same "+ - "process as the service", syscall.Getpgrp(), m.ProcessGroupPID) + if os.Getppid() != m.PrimaryPID { + return errors.Errorf("ProcessMonitor is a sub-process of '%s' not service primary process '%s'. "+ + "ProcessMonitor is expected to only be used by the go-java-launcher itself, under the same process as the"+ + " service", os.Getppid(), m.PrimaryPID) } - if m.ProcessGroupPID == 1 { - return errors.New("ProcessMonitor service group given is '1', refusing to monitor services under " + - "init process group") - } return nil } -func (m *ProcessMonitor) isAlive() bool { +func IsPidAlive(pid int) bool { // This always succeeds on unix systems as it merely creates a process object - process, err := os.FindProcess(m.PrimaryPID) + process, err := os.FindProcess(pid) if err != nil { return false } + return IsProcessAlive(process) +} + +func IsProcessAlive(process *os.Process) bool { // Sending a signal of 0 checks the process exists, without actually sending a signal, // see https://linux.die.net/man/2/kill - err = process.Signal(syscall.Signal(0)) + err := process.Signal(syscall.Signal(0)) if err != nil { return false } return true } + +func SignalPid(pid int, sign os.Signal) error { + process, err := os.FindProcess(pid) + if err != nil || !IsProcessAlive(process) { + fmt.Printf("Sub-process %d is dead\n", pid) + return nil + } + + if err := process.Signal(sign); err != nil { + fmt.Println("error signalling sub-process", pid, err, sign) + return err + } + return nil +}