Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: run housekeeping, prolog, epilog in the flux systemd instance #6662

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -651,10 +651,7 @@ AC_CONFIG_FILES( \
etc/flux-hostlist.pc \
etc/flux-taskmap.pc \
etc/flux.service \
etc/flux-housekeeping@.service \
src/cmd/flux-run-housekeeping \
etc/flux-prolog@.service \
etc/flux-epilog@.service \
src/cmd/flux-run-prolog \
src/cmd/flux-run-epilog \
doc/Makefile \
Expand Down
5 changes: 1 addition & 4 deletions etc/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
#if HAVE_SYSTEMD
systemdsystemunit_DATA = \
flux.service \
flux-housekeeping@.service \
flux-prolog@.service \
flux-epilog@.service
flux.service
#endif

tmpfilesdir = $(prefix)/lib/tmpfiles.d
Expand Down
9 changes: 0 additions & 9 deletions etc/flux-epilog@.service.in

This file was deleted.

18 changes: 0 additions & 18 deletions etc/flux-housekeeping@.service.in

This file was deleted.

9 changes: 0 additions & 9 deletions etc/flux-prolog@.service.in

This file was deleted.

22 changes: 4 additions & 18 deletions src/cmd/flux-run-epilog.in
Original file line number Diff line number Diff line change
@@ -1,21 +1,7 @@
#!/bin/sh

if test $FLUX_JOB_ID; then
FLUX_JOB_ID=$(flux job id --to=f58plain $FLUX_JOB_ID)
fi
unitname=flux-epilog@${FLUX_JOB_ID:-unknown}
# This script exists for transition purposes and will be removed.
# Please reconfigure the IMP [run.epilog] table to use
# path = "@X_SYSCONFDIR@/flux/system/epilog".

terminate() {
systemctl stop $unitname
exit 1
}

trap terminate INT TERM

umask 022
printenv >@X_RUNSTATEDIR@/${unitname}.env

# Run systemctl start in background and `wait` for it so that the trap
# will run immediately when signal is received:
systemctl start $unitname --quiet &
wait $!
exec @X_SYSCONFDIR@/flux/system/epilog
22 changes: 4 additions & 18 deletions src/cmd/flux-run-housekeeping.in
Original file line number Diff line number Diff line change
@@ -1,21 +1,7 @@
#!/bin/sh

if test $FLUX_JOB_ID; then
FLUX_JOB_ID=$(flux job id --to=f58plain $FLUX_JOB_ID)
fi
unitname=flux-housekeeping@${FLUX_JOB_ID:-unknown}
# This script exists for transition purposes and will be removed.
# Please reconfigure the IMP [run.housekeeping] table to use
# path = "@X_SYSCONFDIR@/flux/system/housekeeping".

terminate() {
systemctl stop $unitname
exit 1
}

trap terminate INT TERM

umask 022
printenv >@X_RUNSTATEDIR@/${unitname}.env

# Run systemctl start in background and `wait` for it so that the trap
# will run immediately when signal is received:
systemctl start $unitname --quiet &
wait $!
exec @X_SYSCONFDIR@/flux/system/housekeeping
22 changes: 4 additions & 18 deletions src/cmd/flux-run-prolog.in
Original file line number Diff line number Diff line change
@@ -1,21 +1,7 @@
#!/bin/sh

if test $FLUX_JOB_ID; then
FLUX_JOB_ID=$(flux job id --to=f58plain $FLUX_JOB_ID)
fi
unitname=flux-prolog@${FLUX_JOB_ID:-unknown}
# This script exists for transition purposes and will be removed.
# Please reconfigure the IMP [run.prolog] table to use
# path = "@X_SYSCONFDIR@/flux/system/prolog".

terminate() {
systemctl stop $unitname
exit 1
}

trap terminate INT TERM

umask 022
printenv >@X_RUNSTATEDIR@/${unitname}.env

# Run systemctl start in background and `wait` for it so that the trap
# will run immediately when signal is received:
systemctl start $unitname --quiet &
wait $!
exec @X_SYSCONFDIR@/flux/system/prolog
115 changes: 83 additions & 32 deletions src/modules/job-manager/housekeeping.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
#include "src/common/libhostlist/hostlist.h"
#include "src/common/libutil/fsd.h"
#include "src/common/libutil/errprintf.h"
#include "src/common/libutil/errno_safe.h"
#include "src/common/libjob/idf58.h"
#include "src/common/libsubprocess/bulk-exec.h"
#include "src/common/libsubprocess/command.h"
Expand All @@ -107,6 +108,8 @@
// -1 = never, 0 = immediate, >0 = time in seconds
static const double default_release_after = -1;

static const char *default_exec_service = "rexec";

struct allocation {
flux_jobid_t id;
struct rlist *rl; // R, diminished each time a subset is released
Expand All @@ -119,13 +122,15 @@
double t_start;
struct bulk_exec *bulk_exec;
void *list_handle;
int kill_signal;
};

struct housekeeping {
struct job_manager *ctx;
flux_cmd_t *cmd; // NULL if not configured
char *exec_service;
double release_after;
char *imp_path;
int kill_signal;
zlistx_t *allocations;
flux_msg_handler_t **handlers;
};
Expand Down Expand Up @@ -181,6 +186,8 @@
return NULL;
a->hk = hk;
a->id = id;
a->kill_signal = hk->kill_signal;

a->t_start = flux_reactor_now (flux_get_reactor (hk->ctx->h));
if (!(a->rl = rlist_from_json (R, NULL))
|| !(a->pending = rlist_ranks (a->rl))
Expand All @@ -191,9 +198,9 @@
allocation_timeout,
a))
|| !(a->bulk_exec = bulk_exec_create (&bulk_ops,
"rexec",
hk->exec_service,
id,
"housekeeping",
"flux-housekeeping",
a))
|| update_cmd_env (hk->cmd, id, userid) < 0
|| bulk_exec_push_cmd (a->bulk_exec, a->pending, hk->cmd, 0) < 0) {
Expand Down Expand Up @@ -698,7 +705,10 @@
while (a) {
if (a->id == jobid || jobid == FLUX_JOBID_ANY) {
if (a->bulk_exec) {
f = bulk_exec_kill (a->bulk_exec, ids, signum);
int sig = signum;
if (sig == SIGKILL)
sig = a->kill_signal;

Check warning on line 710 in src/modules/job-manager/housekeeping.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-manager/housekeeping.c#L710

Added line #L710 was not covered by tests
f = bulk_exec_kill (a->bulk_exec, ids, sig);
if (flux_future_then (f, -1, kill_continuation, hk) < 0)
flux_future_destroy (f);
}
Expand Down Expand Up @@ -742,6 +752,43 @@
return cmd;
}

/* Create the housekeeping command template based on parsed configuration.
 * If no 'cmdline' was configured, assume "imp run housekeeping".
 * In addition, if the IMP will run as the main process in a transient
 * systemd unit, set options for correct signal forwarding semantics.
 * 'kill_signal' is assigned (only) if a SIGKILL proxy signal is needed.
 *
 * Parameters:
 *   cmdline      - configured command line as a JSON value, or NULL to
 *                  fall back to [imp_path, "run", "housekeeping"]
 *   exec_service - exec service name; only compared against "sdexec" here
 *   imp_path     - IMP path, used only when 'cmdline' is NULL
 *   kill_signal  - output; set to SIGUSR1 when the default IMP command is
 *                  combined with "sdexec", otherwise left untouched
 *
 * Returns the new command on success (the caller takes ownership), or
 * NULL on failure.
 */
static flux_cmd_t *create_cmd_template (json_t *cmdline,
const char *exec_service,
const char *imp_path,
int *kill_signal)
{
flux_cmd_t *cmd = NULL;
json_t *o = NULL;
bool use_imp = false;

// No configured command line: synthesize the default IMP invocation.
// 'o' holds the temporary JSON array and is released before returning.
if (!cmdline) {
if (!(o = json_pack ("[sss]", imp_path, "run", "housekeeping")))
return NULL;
cmdline = o;
use_imp = true;
}
if (!(cmd = create_cmd (cmdline)))
goto error;

Check warning on line 777 in src/modules/job-manager/housekeeping.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-manager/housekeeping.c#L777

Added line #L777 was not covered by tests
// Only the default IMP command under sdexec gets the extra options.
// NOTE(review): the SDEXEC_PROP_ options presumably map onto the systemd
// unit properties KillMode= and SendSIGKILL= - confirm against sdexec.
if (use_imp && streq (exec_service, "sdexec")) {
if (flux_cmd_setopt (cmd, "SDEXEC_PROP_KillMode", "process") < 0
|| flux_cmd_setopt (cmd, "SDEXEC_PROP_SendSIGKILL", "off") < 0)
goto error;
// SIGUSR1 becomes the proxy signal that is sent in place of SIGKILL.
*kill_signal = SIGUSR1;

Check warning on line 782 in src/modules/job-manager/housekeeping.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-manager/housekeeping.c#L779-L782

Added lines #L779 - L782 were not covered by tests
}
json_decref (o);
return cmd;
error:
// Release the partially built command and the temporary array.
// NOTE(review): ERRNO_SAFE_WRAP presumably preserves errno across the
// json_decref call - confirm in errno_safe.h.
flux_cmd_destroy (cmd);
ERRNO_SAFE_WRAP (json_decref, o);
return NULL;

Check warning on line 789 in src/modules/job-manager/housekeeping.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-manager/housekeeping.c#L786-L789

Added lines #L786 - L789 were not covered by tests
}

static int housekeeping_parse_config (const flux_conf_t *conf,
flux_error_t *error,
void *arg)
Expand All @@ -755,8 +802,10 @@
double release_after = default_release_after;
flux_cmd_t *cmd = NULL;
const char *imp_path = NULL;
char *imp_path_cpy = NULL;
int use_systemd_unit = 0;
const char *exec_service = default_exec_service;
char *exec_service_cpy = NULL;
int kill_signal = SIGKILL;

if (flux_conf_unpack (conf,
&e,
Expand Down Expand Up @@ -785,8 +834,19 @@
" - ignoring");
}

// let job-exec handle exec errors
(void)flux_conf_unpack (conf, NULL, "{s?{s?s}}", "exec", "imp", &imp_path);
// let job-exec handle exec parse errors
(void)flux_conf_unpack (conf,
NULL,
"{s?{s?s s?s}}",
"exec",
"imp", &imp_path,
"service", &exec_service);

if (!cmdline && !imp_path) {
return errprintf (error,
"job-manager.housekeeping implies IMP"
" but exec.imp is undefined");
}

if (release_after_fsd) {
if (fsd_parse_duration (release_after_fsd, &release_after) < 0)
Expand All @@ -795,40 +855,28 @@
" FSD parse error");
}

if (cmdline) {
if (!(cmd = create_cmd (cmdline)))
return errprintf (error, "error creating housekeeping command");
if (!(exec_service_cpy = strdup (exec_service)))
return errprintf (error, "error duplicating exec service");

Check warning on line 859 in src/modules/job-manager/housekeeping.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-manager/housekeeping.c#L859

Added line #L859 was not covered by tests
if (!(cmd = create_cmd_template (cmdline,
exec_service,
imp_path,
&kill_signal))) {
ERRNO_SAFE_WRAP (free, exec_service_cpy);
return errprintf (error, "could not create command template");

Check warning on line 865 in src/modules/job-manager/housekeeping.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-manager/housekeeping.c#L864-L865

Added lines #L864 - L865 were not covered by tests
}

// if no command line was defined, assume "imp exec housekeeping"
else {
if (!imp_path) {
return errprintf (error,
"job-manager.housekeeping implies IMP"
" but exec.imp is undefined");
}
json_t *o;
if ((o = json_pack ("[sss]", imp_path, "run", "housekeeping")))
cmd = create_cmd (o);
json_decref (o);
if (!cmd)
return errprintf (error, "error creating housekeeping command");
if (!(imp_path_cpy = strdup (imp_path))) {
flux_cmd_destroy (cmd);
return errprintf (error, "error duplicating IMP path");
}
}
done:
flux_cmd_destroy (hk->cmd);
hk->cmd = cmd;
free (hk->imp_path);
hk->imp_path = imp_path_cpy;
free (hk->exec_service);
hk->exec_service = exec_service_cpy;
hk->release_after = release_after;
hk->kill_signal = kill_signal;
flux_log (hk->ctx->h,
LOG_DEBUG,
"housekeeping is %sconfigured%s",
hk->cmd ? "" : "not ",
hk->imp_path ? " with IMP" : "");
(imp_path && !cmdline) ? " with IMP" : "");
return 1; // allow dynamic changes
}

Expand All @@ -850,7 +898,7 @@
flux_cmd_destroy (hk->cmd);
zlistx_destroy (&hk->allocations);
flux_msg_handler_delvec (hk->handlers);
free (hk->imp_path);
free (hk->exec_service);
free (hk);
errno = saved_errno;
}
Expand All @@ -865,10 +913,13 @@
return NULL;
hk->ctx = ctx;
hk->release_after = default_release_after;
if (!(hk->exec_service = strdup (default_exec_service)))
goto error;

Check warning on line 917 in src/modules/job-manager/housekeeping.c

View check run for this annotation

Codecov / codecov/patch

src/modules/job-manager/housekeeping.c#L917

Added line #L917 was not covered by tests
if (!(hk->allocations = zlistx_new ())) {
errno = ENOMEM;
goto error;
}
hk->kill_signal = SIGKILL;
zlistx_set_destructor (hk->allocations, allocation_destructor);
if (conf_register_callback (ctx->conf,
&error,
Expand Down
Loading
Loading