From 044b416af5339a278543b4263fb102fb1f2f5708 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Tue, 1 Sep 2020 05:13:41 +0100 Subject: [PATCH] Improved handling of SIGTERM (#660) * Return exit code 15 (SIGTERM) after SIGTERM. When marian receives signal SIGTERM and exits gracefully (save model & exit), it should then exit with a non-zero exit code, to signal to any parent process that it did not exit "naturally". * Added explanatory comment about exiting marian_train with non-zero status after SIGTERM. * Bug fix: better handling of SIGTERM for graceful shutdown during training. Prior to this bug fix, BatchGenerator::fetchBatches, which runs in a separate thread, would ignore SIGTERM during training (training uses a custom signal handler for SIGTERM, which simply sets a global flag, to enable graceful shutdown (i.e., save models and current state of training before shutting down). The changes in this commit also facilitate custom handling of other signals in the future by providing a general singal handler for all signals with a signal number below 32 (setSignalFlag) and a generic flag checking function (getSignalFlag(sig)) for checking such flags. --- CHANGELOG.md | 1 + src/CMakeLists.txt | 2 +- src/command/marian_train.cpp | 11 +++---- src/common/config_parser.cpp | 9 ++++++ src/common/signal_handling.cpp | 58 ++++++++++++++++++++++++++++++++++ src/common/signal_handling.h | 39 +++++++++++++++++++++++ src/data/batch_generator.h | 9 ++++-- src/training/scheduler.cpp | 43 ------------------------- src/training/scheduler.h | 19 +++++------ src/training/training.h | 16 ++++++++++ 10 files changed, 144 insertions(+), 63 deletions(-) create mode 100644 src/common/signal_handling.cpp create mode 100644 src/common/signal_handling.h delete mode 100644 src/training/scheduler.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index cc948bac1..9f5ee08d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Properly record cmake variables in the cmake build directory instead of the source tree. - Added default "none" for option shuffle in BatchGenerator, so that it works in executables where shuffle is not an option. - Added a few missing header files in shortlist.h and beam_search.h. +- Improved handling for receiving SIGTERM during training. By default, SIGTERM triggers 'save (now) and exit'. Prior to this fix, batch pre-fetching did not check for this sigal, potentially delaying exit considerably. It now pays attention to that. Also, the default behaviour of save-and-exit can now be disabled on the command line with --sigterm exit-immediately. ### Changed - Move Simple-WebSocket-Server to submodule diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6cb6bea42..f95941ecd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -25,6 +25,7 @@ add_library(marian STATIC common/filesystem.cpp common/file_stream.cpp common/file_utils.cpp + common/signal_handling.cpp common/types.cpp data/alignment.cpp @@ -99,7 +100,6 @@ add_library(marian STATIC training/graph_group_singleton.cpp training/validator.cpp training/communicator.cpp - training/scheduler.cpp # this is only compiled to catch build errors, but not linked microsoft/quicksand.cpp diff --git a/src/command/marian_train.cpp b/src/command/marian_train.cpp index d1978fab4..f9c6492d9 100644 --- a/src/command/marian_train.cpp +++ b/src/command/marian_train.cpp @@ -1,6 +1,7 @@ #include #include "marian.h" +#include "common/signal_handling.h" #include "training/graph_group_async.h" #include "training/graph_group_singleton.h" #include "training/graph_group_sync.h" @@ -42,14 +43,12 @@ int main(int argc, char** argv) { New>(options)->run(); } } - - // If we exit due to SIGTERM, exit with 128 + the signal number, as suggested - // for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent + // If we exit due to a graceful exit request via SIGTERM, exit with 128 + SIGTERM, + // as suggested for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent // scripts to determine if training terminated naturally or via SIGTERM. - // Whith this approach we can accommodate additional signals in the future. - // An alternative would be to return 124, which is what the timeout command + // An alternative would be to exit with code 124, which is what the timeout command // returns for timeout -s SIGTERM ...., because exiting after SIGTERM // is not technically a fatal error (which is what the 128+x convention usually // stands for). - return getSigtermFlag() ? (128 + SIGTERM) : 0; + exit(getSignalFlag(SIGTERM) ? 128 + SIGTERM : EXIT_SUCCESS); } diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index a44a00826..e698ddcd4 100755 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -143,6 +143,15 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) { cli.add("--dump-config", "Dump current (modified) configuration to stdout and exit. Possible values: full, minimal, expand") ->implicit_val("full"); + if(mode_ == cli::mode::training) { + // --sigterm is deliberately not a boolean, to allow for a consistent + // pattern of specifying custom signal handling in the future. + // (e.g., dump model but continue training upon SIGUSR1, or report current + // training status upon SIGINFO.) + cli.add("--sigterm", + "What to do with SIGTERM: save-and-exit or exit-immediately.", + "save-and-exit"); + } // clang-format on } diff --git a/src/common/signal_handling.cpp b/src/common/signal_handling.cpp new file mode 100644 index 000000000..8e3fd9133 --- /dev/null +++ b/src/common/signal_handling.cpp @@ -0,0 +1,58 @@ +#include "common/logging.h" +#include "signal_handling.h" + +// The simplest (and recommended) way to handle signals is to simply set a flag +// in the signal handler and check that flag later. +// +// We provide setSignalFlag as the most generic signal handler. This handler uses a +// single sig_atomic_t as a bit field. On Linux, sig_atomic_t is equivalent to a signed int, +// theoretically providing 32 binary flags; in practice, most likely signals for which we may +// want to install signal handlers are +// - SIGTERM (15): which by default signals the request for a graceful shutdown +// - SIGUSR1 (10): intended for custom use, default action in Linux is termination +// - SIGUSR2 (12): intended for custom use, default action in Linux is termination +// - SIGINT (2): interrupt from the console +// Just to be safe, we accommodate signals up to signal No. 30. + +// In addition, we also provide requestSaveAndExit() and saveAndExit() as a signal +// handler/checker for graceful shutdown requests during training. +constexpr int maxSignalForSetSignalFlag{30}; + +// Make sure sig_atomic_t is large enough as a bit field for our purposes. +// That said, I'm not aware of any platform where this would be a problem. +static_assert(SIG_ATOMIC_MAX > (1U< maxSignalForSetSignalFlag, + "Signal out of range (must be < {}, is {}).", maxSignalForSetSignalFlag, sig); + // Do bitwise AND between sigflags_ and an int value that has exactly one bit set that + // corresponds to the signal in question. If the bit is set (see setSignalFlag above), + // the bitwise AND will return a non-zero integer, if it is not set, the result will + // be zero. + return (sigflags_ & (1< +#include + +// SIGNAL HANDLING + +// The signal handlers (and checkers) here are implemented in line with with the recommendations +// for signal handling in the SEI CERT C Coding Standard, specifically +// +// - SIG30-C: +// https://wiki.sei.cmu.edu/confluence/display/c/SIG30-C.+Call+only+asynchronous-safe+functions+within+signal+handlers +// +// - SIG31-C: +// https://wiki.sei.cmu.edu/confluence/display/c/SIG31-C.+Do+not+access+shared+objects+in+signal+handlers +// +// The exact behavior of 'graceful exit' depends on the application; for training, it means 'save model and exit', +// for a server (not implemented yet): 'block new requests but serve pending requests and then exit'. +// +// Graceful exit for training is useful for training on clusters with time limits on jobs. Slurm, for example, can be +// set up to send a custom signal at a set time before the end of the time slot, giving Marian time to save its current +// state before getting killed. + +namespace marian { + + +/// Request graceful exit (signal handler) +void requestSaveAndExit(int sig); + +/// Check if graceful exit was requested. +bool saveAndExitRequested(); + +/// General purpose signal handler that simply sets a flag when a signal is received. +// (only for SIGNAL No. < 32). +void setSignalFlag(int sig); // custom handler (set flag) for sig + +/// Check if a setSignalFlag was triggered for this signal +bool getSignalFlag(int sig); + +} // End of namespace marian diff --git a/src/data/batch_generator.h b/src/data/batch_generator.h index 88d3efb92..d55e765e3 100644 --- a/src/data/batch_generator.h +++ b/src/data/batch_generator.h @@ -1,6 +1,7 @@ #pragma once #include "common/options.h" +#include "common/signal_handling.h" #include "data/batch_stats.h" #include "data/rng_engine.h" #include "training/training_state.h" @@ -136,6 +137,8 @@ class BatchGenerator : public RNGEngine { } size_t sets = 0; while(current_ != data_->end() && maxiBatch->size() < maxSize) { // loop over data + if (saveAndExitRequested()) // stop generating batches + return std::deque(); maxiBatch->push(*current_); sets = current_->size(); // do not consume more than required for the maxi batch as this causes @@ -161,6 +164,8 @@ class BatchGenerator : public RNGEngine { if (stats_) cachedStatsIter = stats_->begin(); while(!maxiBatch->empty()) { // while there are sentences in the queue + if (saveAndExitRequested()) // stop generating batches + return std::deque(); // push item onto batch batchVector.push_back(maxiBatch->top()); maxiBatch->pop(); // fetch next-shortest @@ -249,7 +254,7 @@ class BatchGenerator : public RNGEngine { "If you have changed the training corpus, add --no-restore-corpus to the training command and run it again."); bufferedBatches_ = std::move(futureBufferedBatches_.get()); // if bg thread returns an empty swath, we hit the end of the epoch - if (bufferedBatches_.empty()) { + if (bufferedBatches_.empty() || saveAndExitRequested()) { return nullptr; } // and kick off the next bg operation @@ -257,7 +262,7 @@ class BatchGenerator : public RNGEngine { } else { // don't spawn any threads, i.e. batch fetching is blocking. bufferedBatches_ = fetchBatches(); // if bufferedBatches is empty we hit the end of the epoch - if (bufferedBatches_.empty()) { + if (bufferedBatches_.empty() || saveAndExitRequested()) { return nullptr; } } diff --git a/src/training/scheduler.cpp b/src/training/scheduler.cpp deleted file mode 100644 index 4c30cb04e..000000000 --- a/src/training/scheduler.cpp +++ /dev/null @@ -1,43 +0,0 @@ -#include "scheduler.h" -#include -#include - -namespace marian { - -// SIGNAL HANDLING, see scheduler.cpp for definitions -// Currently, only the following is handled by a custom signal handler: -// SIGTERM: When SIGTERM is received, the global (static member) flag sigterm_ (false by default) is set to true -// by signalHandler(). When sigterm_ is true, keepGoing() returns false, and the current state of training models -// is saved prior to exiting. -// This functionality is helpful when training on clusters with time limits on compute slots, e.g., on s -// clusters managed by slurm. Slurm can be asked to sending a (custom) warning signal to a process at a given -// point in time prior to the hard "time's up". - -bool sigterm_{false}; // flag signalling that SIGTERM has been received false by default, set to true by signalHandler(SIGTERM) - -void signalHandler(int sig) { - // Note: sys_siglist[sig] or stdsignal() describe the effect (e.g., - // 'Terminated' rather than provide the signal name (which are #define(s) - // in signal.h), so we have to do custom log messages here. - switch (sig) { - case SIGTERM: // save models and exit - LOG(info, "[training] Scheduler received signal SIGTERM"); // @TODO: figure out if this is safe. The logs are global and thread-safe, so should be OK? - sigterm_ = true; - break; - default: - ABORT("No action defined for signal {}", sig); - } -} - -// installs signalHandler() for select signals (currently only SIGTERM) -void installSignalHandlers() { - // TODO: use sigaction instead of signal, - // cf. https://stackoverflow.com/questions/231912/what-is-the-difference-between-sigaction-and-signal - signal(SIGTERM, signalHandler); -} - -bool getSigtermFlag() { - return sigterm_; -} - -} diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 8c8701cac..a3828cd32 100755 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -1,6 +1,7 @@ #pragma once #include "common/options.h" +#include "common/signal_handling.h" #include "training/training_state.h" #include "training/validator.h" #include "training/communicator.h" @@ -8,9 +9,6 @@ namespace marian { -bool getSigtermFlag(); -void installSignalHandlers(); - class Scheduler : public TrainingObserver { private: Ptr options_; @@ -154,11 +152,10 @@ class Scheduler : public TrainingObserver { : options_(options), state_(state) { ABORT_IF(state_->factor != 1, "state.factor unexpectedly not 1 at this point??"); updateLearningRate(*state); - installSignalHandlers(); } bool keepGoing() { - if(getSigtermFlag()) // received signal SIGERM => exit gracefully + if(saveAndExitRequested()) // via SIGTERM return false; // stop if it reached the maximum number of epochs @@ -192,13 +189,12 @@ class Scheduler : public TrainingObserver { void started() { LOG(info, "Training started"); } void finished() { - if (getSigtermFlag()) - LOG(info, "Training interrupted (SIGTERM)."); + if (saveAndExitRequested()) + LOG(info, "Training interrupted (via signal)."); else LOG(info, "Training finished"); } - void addValidator(Ptr validator) { validators_.push_back(validator); @@ -223,9 +219,10 @@ class Scheduler : public TrainingObserver { void validate(const std::vector>& graphs, bool isFinal = false) { - // Do not validate if already validated (for instance, after the model is - // loaded) or if validation is scheduled for another update, or when signal SIGTERM was received - if(getSigtermFlag() // SIGTERM was received + // Do not validate if already validated (for instance, after the model is loaded) + // or if validation is scheduled for another update, or when a graceful shutdown + // was requested. + if(saveAndExitRequested() || state_->validated // already validated (in resumed training, for example) || (!state_->enteredNewPeriodOf(options_->get("valid-freq")) && !isFinal)) // not now return; diff --git a/src/training/training.h b/src/training/training.h index 5a2be7635..d2be8b872 100644 --- a/src/training/training.h +++ b/src/training/training.h @@ -16,6 +16,7 @@ template class Train : public ModelTask { private: Ptr options_; + void installCustomSignalHandlers(); public: Train(Ptr options) : options_(options) {} @@ -77,6 +78,9 @@ class Train : public ModelTask { bool restored = !options_->get("no-restore-corpus") && batchGenerator->restore(trainState); + // We only want custom behavior once training starts. + installCustomSignalHandlers(); + // -- main training loop scheduler->started(); while(scheduler->keepGoing()) { @@ -107,4 +111,16 @@ class Train : public ModelTask { finalizeMPI(std::move(mpi)); } }; + +template +void Train::installCustomSignalHandlers(){ + const std::string sigTermAction = options_->get("sigterm"); + if (sigTermAction == "save-and-exit") { + LOG(debug, "Will save before exiting upon SIGTERM."); + signal(SIGTERM, requestSaveAndExit); + } + else if (sigTermAction != "exit-immediately") + ABORT("Unrecognized value '{}' for --sigterm", sigTermAction); +} + } // namespace marian