Skip to content

Commit

Permalink
Merge pull request flux-framework#4990 from grondo/issue#4987
Browse files Browse the repository at this point in the history
prevent orphaned job processes when terminating jobs due to exception
  • Loading branch information
mergify[bot] authored Mar 29, 2023
2 parents 609c2ee + 5a23713 commit 3b2562e
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 5 deletions.
37 changes: 35 additions & 2 deletions src/modules/job-exec/job-exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ void jobinfo_decref (struct jobinfo *job)
idset_destroy (job->critical_ranks);
eventlogger_destroy (job->ev);
flux_watcher_destroy (job->kill_timer);
flux_watcher_destroy (job->kill_shell_timer);
flux_watcher_destroy (job->expiration_timer);
zhashx_delete (job->ctx->jobs, &job->id);
if (job->impl && job->impl->exit)
Expand Down Expand Up @@ -355,29 +356,61 @@ static void jobinfo_complete (struct jobinfo *job, const struct idset *ranks)
}
}

/* Timer callback that sends SIGKILL to the job shell of a job.
 *
 * A debug message naming the job id is logged, then SIGKILL is passed
 * to the kill method of the job's exec implementation.
 *
 * r, w, revents - not used by this callback
 * arg           - the struct jobinfo of the affected job
 */
static void kill_shell_timer_cb (flux_reactor_t *r,
                                 flux_watcher_t *w,
                                 int revents,
                                 void *arg)
{
    struct jobinfo *info = arg;
    uintmax_t jobid = (uintmax_t) info->id;

    flux_log (info->h,
              LOG_DEBUG,
              "Sending SIGKILL to job shell for job %ju",
              jobid);
    info->impl->kill (info, SIGKILL);
}

/* Timer callback run when the kill timer of a job fires.
 *
 * Logs a debug message and requests SIGKILL for the job through
 * flux_job_kill(). The request is "open loop": the returned future is
 * destroyed right away and no response is awaited. If the request
 * cannot be created, the error is logged and the callback returns.
 *
 * r, w, revents - not used by this callback
 * arg           - the struct jobinfo of the affected job
 */
static void kill_timer_cb (flux_reactor_t *r, flux_watcher_t *w,
                           int revents, void *arg)
{
    struct jobinfo *job = arg;
    flux_future_t *f;

    flux_log (job->h,
              LOG_DEBUG,
              "Sending SIGKILL to job %ju",
              (uintmax_t) job->id);

    /* The job shell is deliberately not sent SIGKILL here. Killing the
     * shell at the same moment would presumably leave the job's
     * processes orphaned; the shell gets its own SIGKILL from
     * kill_shell_timer_cb(), which is scheduled with a longer delay.
     */
    if (!(f = flux_job_kill (job->h, job->id, SIGKILL))) {
        /* Cast so the argument matches %ju, as in flux_log() above */
        flux_log_error (job->h,
                        "flux_job_kill (%ju, SIGKILL)",
                        (uintmax_t) job->id);
        return;
    }
    /* Open loop */
    flux_future_destroy (f);
}


/* Start the SIGKILL timers for a job.
 *
 * Two timer watchers are created on the reactor of job->h, each only
 * if the corresponding pointer in the job is still NULL, so calling
 * this again does not restart a timer that already exists:
 *
 *  - job->kill_timer runs kill_timer_cb after a delay of `after`
 *  - job->kill_shell_timer runs kill_shell_timer_cb after five times
 *    that delay
 *
 * Both are created with a repeat value of 0. (presumably one-shot;
 * confirm against the flux_timer_watcher_create() documentation).
 *
 * job   - job whose timers are started
 * after - delay before kill_timer_cb runs
 */
static void jobinfo_killtimer_start (struct jobinfo *job, double after)
{
    flux_reactor_t *r = flux_get_reactor (job->h);
    /* The job shell is killed this many times later than the job */
    const double shell_delay_factor = 5.;

    /* Only start kill timer if not already running */
    if (job->kill_timer == NULL) {
        job->kill_timer = flux_timer_watcher_create (r,
                                                     after,
                                                     0.,
                                                     kill_timer_cb,
                                                     job);
        flux_watcher_start (job->kill_timer);
    }
    /* Likewise for the timer that kills the job shell itself */
    if (job->kill_shell_timer == NULL) {
        job->kill_shell_timer = flux_timer_watcher_create (r,
                                                           after * shell_delay_factor,
                                                           0.,
                                                           kill_shell_timer_cb,
                                                           job);
        flux_watcher_start (job->kill_shell_timer);
    }
}

static void timelimit_cb (flux_reactor_t *r,
Expand Down
1 change: 1 addition & 0 deletions src/modules/job-exec/job-exec.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ struct jobinfo {

double kill_timeout; /* grace time between sigterm,kill */
flux_watcher_t *kill_timer;
flux_watcher_t *kill_shell_timer;
flux_watcher_t *expiration_timer;

/* Exec implementation for this job */
Expand Down
4 changes: 2 additions & 2 deletions src/modules/job-manager/kill.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ void kill_handle_request (flux_t *h,
errstr = "guests may only send signals to their own jobs";
goto error;
}
if (job->state != FLUX_JOB_STATE_RUN) {
if (!(job->state & FLUX_JOB_STATE_RUNNING)) {
errstr = "job is not running";
errno = EINVAL;
goto error;
Expand Down Expand Up @@ -160,7 +160,7 @@ void killall_handle_request (flux_t *h,
}
job = zhashx_first (ctx->active_jobs);
while (job) {
if (job->state != FLUX_JOB_STATE_RUN)
if (!(job->state & FLUX_JOB_STATE_RUNNING))
goto next;
if (userid != FLUX_USERID_UNKNOWN && userid != job->userid)
goto next;
Expand Down
1 change: 1 addition & 0 deletions t/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ TESTSCRIPTS = \
t2403-job-exec-conf.t \
t2404-job-exec-multiuser.t \
t2405-job-exec-sdexec.t \
t2406-job-exec-cleanup.t \
t2410-exec-systemd.t \
t2500-job-attach.t \
t2501-job-status.t \
Expand Down
15 changes: 14 additions & 1 deletion t/t2402-job-exec-dummy.t
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,20 @@ test_expect_success 'job-exec: job exception uses SIGKILL after kill-timeout' '
flux cancel $id &&
(flux job attach -vEX $id >kill.output 2>&1 || true) &&
test_debug "cat kill.output" &&
grep "trap-sigterm got SIGTERM" kill.output &&
grep "trap-sigterm got SIGTERM" kill.output
'
# Submit a job whose command ignores SIGTERM, writes ready=1 to the KVS
# and then loops forever. Once the ready key exists in the job's
# namespace the job is canceled. The test then expects the job to reach
# the clean event within 15s, the job id to show up in flux dmesg, and
# flux job status to exit with 137 (128 + 9, i.e. SIGKILL). job-exec is
# reloaded at the end.
# The signal is named TERM rather than SIGTERM: POSIX only requires
# trap to accept signal names without the SIG prefix.
test_expect_success 'job-exec: job shell eventually killed by SIGKILL' '
    id=$(flux submit --wait-event=start -n1 \
        sh -c "trap \"\" TERM;
               flux kvs put ready=1;
               while true; do sleep 1; done") &&
    flux kvs get --waitcreate \
        --namespace=$(flux job namespace $id) \
        ready &&
    flux cancel $id &&
    flux job wait-event -vt 15 $id clean &&
    flux dmesg | grep $(flux job id $id) &&
    test_expect_code 137 flux job status $id &&
    flux module reload job-exec
'
test_expect_success 'job-exec: invalid job shell generates exception' '
Expand Down
28 changes: 28 additions & 0 deletions t/t2406-job-exec-cleanup.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/sh

test_description='Test flux job exec job cleanup via SIGKILL'

. $(dirname $0)/sharness.sh

test_under_flux 1 job

# Reload job-exec with kill-timeout=0.1s for the tests below.
test_expect_success 'job-exec: reload module with short kill-timeout' '
    flux module reload job-exec kill-timeout=0.1s
'
# The submitted script traps signal 15 (SIGTERM), stores its own pid in
# the KVS under the key "pid" and then sleeps. The pid is read back from
# the job's KVS namespace, waiting for the key to be created, and kept
# in $pid (along with $id) for the next test.
test_expect_success 'job-exec: run test program that blocks SIGTERM' '
    id=$(flux submit --wait-event=start -n 1 -o trap.out \
        sh -c "trap \"echo got SIGTERM\" 15; \
               flux kvs put pid=\$\$; \
               sleep inf; sleep inf") &&
    ns=$(flux job namespace $id) &&
    pid=$(flux kvs get -WN ${ns} pid) &&
    test_debug "echo script running as pid=$pid"
'
# Cancel the job, expect flux job status to exit with 137 (128 + 9,
# i.e. SIGKILL), and expect that ps no longer finds the script's pid.
test_expect_success 'job-exec: ensure cancellation kills job' '
    test_debug "echo Canceling $id" &&
    flux cancel $id &&
    test_debug "flux job attach -vEX $id || :" &&
    test_expect_code 137 flux job status $id &&
    test_must_fail ps -q $pid
'
test_done

0 comments on commit 3b2562e

Please sign in to comment.