diff --git a/CHANGELOG.md b/CHANGELOG.md index cc948bac1..9f5ee08d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Properly record cmake variables in the cmake build directory instead of the source tree. - Added default "none" for option shuffle in BatchGenerator, so that it works in executables where shuffle is not an option. - Added a few missing header files in shortlist.h and beam_search.h. +- Improved handling for receiving SIGTERM during training. By default, SIGTERM triggers 'save (now) and exit'. Prior to this fix, batch pre-fetching did not check for this sigal, potentially delaying exit considerably. It now pays attention to that. Also, the default behaviour of save-and-exit can now be disabled on the command line with --sigterm exit-immediately. ### Changed - Move Simple-WebSocket-Server to submodule diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6cb6bea42..f95941ecd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -25,6 +25,7 @@ add_library(marian STATIC common/filesystem.cpp common/file_stream.cpp common/file_utils.cpp + common/signal_handling.cpp common/types.cpp data/alignment.cpp @@ -99,7 +100,6 @@ add_library(marian STATIC training/graph_group_singleton.cpp training/validator.cpp training/communicator.cpp - training/scheduler.cpp # this is only compiled to catch build errors, but not linked microsoft/quicksand.cpp diff --git a/src/command/marian_train.cpp b/src/command/marian_train.cpp index d1978fab4..f9c6492d9 100644 --- a/src/command/marian_train.cpp +++ b/src/command/marian_train.cpp @@ -1,6 +1,7 @@ #include #include "marian.h" +#include "common/signal_handling.h" #include "training/graph_group_async.h" #include "training/graph_group_singleton.h" #include "training/graph_group_sync.h" @@ -42,14 +43,12 @@ int main(int argc, char** argv) { New>(options)->run(); } } - - // If we exit due to SIGTERM, exit with 128 + the signal number, as suggested - // for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent + // If we exit due to a graceful exit request via SIGTERM, exit with 128 + SIGTERM, + // as suggested for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent // scripts to determine if training terminated naturally or via SIGTERM. - // Whith this approach we can accommodate additional signals in the future. - // An alternative would be to return 124, which is what the timeout command + // An alternative would be to exit with code 124, which is what the timeout command // returns for timeout -s SIGTERM ...., because exiting after SIGTERM // is not technically a fatal error (which is what the 128+x convention usually // stands for). - return getSigtermFlag() ? (128 + SIGTERM) : 0; + exit(getSignalFlag(SIGTERM) ? 128 + SIGTERM : EXIT_SUCCESS); } diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index a44a00826..e698ddcd4 100755 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -143,6 +143,15 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) { cli.add("--dump-config", "Dump current (modified) configuration to stdout and exit. Possible values: full, minimal, expand") ->implicit_val("full"); + if(mode_ == cli::mode::training) { + // --sigterm is deliberately not a boolean, to allow for a consistent + // pattern of specifying custom signal handling in the future. + // (e.g., dump model but continue training upon SIGUSR1, or report current + // training status upon SIGINFO.) + cli.add("--sigterm", + "What to do with SIGTERM: save-and-exit or exit-immediately.", + "save-and-exit"); + } // clang-format on } diff --git a/src/common/signal_handling.cpp b/src/common/signal_handling.cpp new file mode 100644 index 000000000..8e3fd9133 --- /dev/null +++ b/src/common/signal_handling.cpp @@ -0,0 +1,58 @@ +#include "common/logging.h" +#include "signal_handling.h" + +// The simplest (and recommended) way to handle signals is to simply set a flag +// in the signal handler and check that flag later. +// +// We provide setSignalFlag as the most generic signal handler. This handler uses a +// single sig_atomic_t as a bit field. On Linux, sig_atomic_t is equivalent to a signed int, +// theoretically providing 32 binary flags; in practice, most likely signals for which we may +// want to install signal handlers are +// - SIGTERM (15): which by default signals the request for a graceful shutdown +// - SIGUSR1 (10): intended for custom use, default action in Linux is termination +// - SIGUSR2 (12): intended for custom use, default action in Linux is termination +// - SIGINT (2): interrupt from the console +// Just to be safe, we accommodate signals up to signal No. 30. + +// In addition, we also provide requestSaveAndExit() and saveAndExit() as a signal +// handler/checker for graceful shutdown requests during training. +constexpr int maxSignalForSetSignalFlag{30}; + +// Make sure sig_atomic_t is large enough as a bit field for our purposes. +// That said, I'm not aware of any platform where this would be a problem. +static_assert(SIG_ATOMIC_MAX > (1U< maxSignalForSetSignalFlag, + "Signal out of range (must be < {}, is {}).", maxSignalForSetSignalFlag, sig); + // Do bitwise AND between sigflags_ and an int value that has exactly one bit set that + // corresponds to the signal in question. If the bit is set (see setSignalFlag above), + // the bitwise AND will return a non-zero integer, if it is not set, the result will + // be zero. + return (sigflags_ & (1< +#include + +// SIGNAL HANDLING + +// The signal handlers (and checkers) here are implemented in line with with the recommendations +// for signal handling in the SEI CERT C Coding Standard, specifically +// +// - SIG30-C: +// https://wiki.sei.cmu.edu/confluence/display/c/SIG30-C.+Call+only+asynchronous-safe+functions+within+signal+handlers +// +// - SIG31-C: +// https://wiki.sei.cmu.edu/confluence/display/c/SIG31-C.+Do+not+access+shared+objects+in+signal+handlers +// +// The exact behavior of 'graceful exit' depends on the application; for training, it means 'save model and exit', +// for a server (not implemented yet): 'block new requests but serve pending requests and then exit'. +// +// Graceful exit for training is useful for training on clusters with time limits on jobs. Slurm, for example, can be +// set up to send a custom signal at a set time before the end of the time slot, giving Marian time to save its current +// state before getting killed. + +namespace marian { + + +/// Request graceful exit (signal handler) +void requestSaveAndExit(int sig); + +/// Check if graceful exit was requested. +bool saveAndExitRequested(); + +/// General purpose signal handler that simply sets a flag when a signal is received. +// (only for SIGNAL No. < 32). +void setSignalFlag(int sig); // custom handler (set flag) for sig + +/// Check if a setSignalFlag was triggered for this signal +bool getSignalFlag(int sig); + +} // End of namespace marian diff --git a/src/data/batch_generator.h b/src/data/batch_generator.h index 88d3efb92..d55e765e3 100644 --- a/src/data/batch_generator.h +++ b/src/data/batch_generator.h @@ -1,6 +1,7 @@ #pragma once #include "common/options.h" +#include "common/signal_handling.h" #include "data/batch_stats.h" #include "data/rng_engine.h" #include "training/training_state.h" @@ -136,6 +137,8 @@ class BatchGenerator : public RNGEngine { } size_t sets = 0; while(current_ != data_->end() && maxiBatch->size() < maxSize) { // loop over data + if (saveAndExitRequested()) // stop generating batches + return std::deque(); maxiBatch->push(*current_); sets = current_->size(); // do not consume more than required for the maxi batch as this causes @@ -161,6 +164,8 @@ class BatchGenerator : public RNGEngine { if (stats_) cachedStatsIter = stats_->begin(); while(!maxiBatch->empty()) { // while there are sentences in the queue + if (saveAndExitRequested()) // stop generating batches + return std::deque(); // push item onto batch batchVector.push_back(maxiBatch->top()); maxiBatch->pop(); // fetch next-shortest @@ -249,7 +254,7 @@ class BatchGenerator : public RNGEngine { "If you have changed the training corpus, add --no-restore-corpus to the training command and run it again."); bufferedBatches_ = std::move(futureBufferedBatches_.get()); // if bg thread returns an empty swath, we hit the end of the epoch - if (bufferedBatches_.empty()) { + if (bufferedBatches_.empty() || saveAndExitRequested()) { return nullptr; } // and kick off the next bg operation @@ -257,7 +262,7 @@ class BatchGenerator : public RNGEngine { } else { // don't spawn any threads, i.e. batch fetching is blocking. bufferedBatches_ = fetchBatches(); // if bufferedBatches is empty we hit the end of the epoch - if (bufferedBatches_.empty()) { + if (bufferedBatches_.empty() || saveAndExitRequested()) { return nullptr; } } diff --git a/src/training/scheduler.cpp b/src/training/scheduler.cpp deleted file mode 100644 index 4c30cb04e..000000000 --- a/src/training/scheduler.cpp +++ /dev/null @@ -1,43 +0,0 @@ -#include "scheduler.h" -#include -#include - -namespace marian { - -// SIGNAL HANDLING, see scheduler.cpp for definitions -// Currently, only the following is handled by a custom signal handler: -// SIGTERM: When SIGTERM is received, the global (static member) flag sigterm_ (false by default) is set to true -// by signalHandler(). When sigterm_ is true, keepGoing() returns false, and the current state of training models -// is saved prior to exiting. -// This functionality is helpful when training on clusters with time limits on compute slots, e.g., on s -// clusters managed by slurm. Slurm can be asked to sending a (custom) warning signal to a process at a given -// point in time prior to the hard "time's up". - -bool sigterm_{false}; // flag signalling that SIGTERM has been received false by default, set to true by signalHandler(SIGTERM) - -void signalHandler(int sig) { - // Note: sys_siglist[sig] or stdsignal() describe the effect (e.g., - // 'Terminated' rather than provide the signal name (which are #define(s) - // in signal.h), so we have to do custom log messages here. - switch (sig) { - case SIGTERM: // save models and exit - LOG(info, "[training] Scheduler received signal SIGTERM"); // @TODO: figure out if this is safe. The logs are global and thread-safe, so should be OK? - sigterm_ = true; - break; - default: - ABORT("No action defined for signal {}", sig); - } -} - -// installs signalHandler() for select signals (currently only SIGTERM) -void installSignalHandlers() { - // TODO: use sigaction instead of signal, - // cf. https://stackoverflow.com/questions/231912/what-is-the-difference-between-sigaction-and-signal - signal(SIGTERM, signalHandler); -} - -bool getSigtermFlag() { - return sigterm_; -} - -} diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 8c8701cac..a3828cd32 100755 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -1,6 +1,7 @@ #pragma once #include "common/options.h" +#include "common/signal_handling.h" #include "training/training_state.h" #include "training/validator.h" #include "training/communicator.h" @@ -8,9 +9,6 @@ namespace marian { -bool getSigtermFlag(); -void installSignalHandlers(); - class Scheduler : public TrainingObserver { private: Ptr options_; @@ -154,11 +152,10 @@ class Scheduler : public TrainingObserver { : options_(options), state_(state) { ABORT_IF(state_->factor != 1, "state.factor unexpectedly not 1 at this point??"); updateLearningRate(*state); - installSignalHandlers(); } bool keepGoing() { - if(getSigtermFlag()) // received signal SIGERM => exit gracefully + if(saveAndExitRequested()) // via SIGTERM return false; // stop if it reached the maximum number of epochs @@ -192,13 +189,12 @@ class Scheduler : public TrainingObserver { void started() { LOG(info, "Training started"); } void finished() { - if (getSigtermFlag()) - LOG(info, "Training interrupted (SIGTERM)."); + if (saveAndExitRequested()) + LOG(info, "Training interrupted (via signal)."); else LOG(info, "Training finished"); } - void addValidator(Ptr validator) { validators_.push_back(validator); @@ -223,9 +219,10 @@ class Scheduler : public TrainingObserver { void validate(const std::vector>& graphs, bool isFinal = false) { - // Do not validate if already validated (for instance, after the model is - // loaded) or if validation is scheduled for another update, or when signal SIGTERM was received - if(getSigtermFlag() // SIGTERM was received + // Do not validate if already validated (for instance, after the model is loaded) + // or if validation is scheduled for another update, or when a graceful shutdown + // was requested. + if(saveAndExitRequested() || state_->validated // already validated (in resumed training, for example) || (!state_->enteredNewPeriodOf(options_->get("valid-freq")) && !isFinal)) // not now return; diff --git a/src/training/training.h b/src/training/training.h index 5a2be7635..d2be8b872 100644 --- a/src/training/training.h +++ b/src/training/training.h @@ -16,6 +16,7 @@ template class Train : public ModelTask { private: Ptr options_; + void installCustomSignalHandlers(); public: Train(Ptr options) : options_(options) {} @@ -77,6 +78,9 @@ class Train : public ModelTask { bool restored = !options_->get("no-restore-corpus") && batchGenerator->restore(trainState); + // We only want custom behavior once training starts. + installCustomSignalHandlers(); + // -- main training loop scheduler->started(); while(scheduler->keepGoing()) { @@ -107,4 +111,16 @@ class Train : public ModelTask { finalizeMPI(std::move(mpi)); } }; + +template +void Train::installCustomSignalHandlers(){ + const std::string sigTermAction = options_->get("sigterm"); + if (sigTermAction == "save-and-exit") { + LOG(debug, "Will save before exiting upon SIGTERM."); + signal(SIGTERM, requestSaveAndExit); + } + else if (sigTermAction != "exit-immediately") + ABORT("Unrecognized value '{}' for --sigterm", sigTermAction); +} + } // namespace marian