From 9e44c545bdd87027bd8b7713488da34f4cc1dc32 Mon Sep 17 00:00:00 2001 From: Marcos Bento Date: Tue, 7 Nov 2023 09:40:31 +0000 Subject: [PATCH] Replace checkpoint file atomically These changes modify the strategy used to store a checkpoint file. Previously the storage operation consisted of (a) moving the current checkpoint file to a backup, and then (b) storing the contents of the checkpoint file. If the server was terminated during (b), the result would be a corrupt checkpoint file. The new strategy consists of (a) storing the contents of the checkpoint file in a temporary file; (b) moving the current checkpoint to backup; and, finally, (c) promoting the temporary checkpoint file to the current by simply renaming the file. In this case, even if the storage operation is interrupted, the current checkpoint file will never lack integrity. Re ECFLOW-1925 --- Server/CMakeLists.txt | 1 + Server/src/CheckPtSaver.cpp | 34 ++++---- Server/src/CheckPtSaver.hpp | 42 ++++++++++ Server/test/TestCheckPtSaver.cpp | 128 +++++++++++++++++++++++++++++++ 4 files changed, 184 insertions(+), 21 deletions(-) create mode 100644 Server/test/TestCheckPtSaver.cpp diff --git a/Server/CMakeLists.txt b/Server/CMakeLists.txt index 5153cceeb..08021c688 100644 --- a/Server/CMakeLists.txt +++ b/Server/CMakeLists.txt @@ -98,6 +98,7 @@ ecbuild_add_test( INCLUDES src SOURCES + test/TestCheckPtSaver.cpp test/TestServerEnvironment.cpp test/TestServer_main.cpp # test entry point test/TestServer1.cpp diff --git a/Server/src/CheckPtSaver.cpp b/Server/src/CheckPtSaver.cpp index b17756dbb..563b77601 100644 --- a/Server/src/CheckPtSaver.cpp +++ b/Server/src/CheckPtSaver.cpp @@ -99,28 +99,20 @@ void CheckPtSaver::terminate() { } bool CheckPtSaver::explicitSave(bool from_server) const { - bool ret = true; - try { #ifdef DEBUG_CHECKPT std::cout << " CheckPtSaver::explicitSave() Saving checkpt file " << serverEnv_->checkPtFilename() << "\n"; #endif - // Time how long we take to checkpt, Help to recognise *SLOW* disk, which can *AFFECT* server performance - DurationTimer durationTimer; + // In order to detect *SLOW* operations (i.e. disk related), which *AFFECT* server performance, + // the following timer measures how long we take to store the CheckPoint file. + // After the Definitions are stored, if a configured warning threshold is passed a warning is issues. + DurationTimer timer; - // Backup checkpoint file if it exists & is non zero - // Avoid an empty file as a backup file, could results from a full file system - // i.e move ecf_checkpt_file --> ecf_backup_checkpt_file fs::path checkPtFile(serverEnv_->checkPtFilename()); - if (fs::exists(checkPtFile) && fs::file_size(checkPtFile) != 0) { - - fs::path oldCheckPtFile(serverEnv_->oldCheckPtFilename()); - fs::remove(oldCheckPtFile); - fs::rename(checkPtFile, oldCheckPtFile); - } - - // write to ecf_checkpt_file, if file system is full this could result in an empty file. ? - server_->defs_->save_as_checkpt(serverEnv_->checkPtFilename()); + fs::path oldCheckPtFile(serverEnv_->oldCheckPtFilename()); + CheckPtSaver::storeWithBackup(checkPtFile, oldCheckPtFile, [&](const fs::path& file_path) { + server_->defs_->save_as_checkpt(file_path.string()); + }); state_change_no_ = Ecf::state_change_no(); // For periodic update only save checkPt if it has changed modify_change_no_ = Ecf::modify_change_no(); // For periodic update only save checkPt if it has changed @@ -131,16 +123,16 @@ bool CheckPtSaver::explicitSave(bool from_server) const { std::string msg = Str::SVR_CMD(); msg += CtsApi::checkPtDefsArg(); std::stringstream ss; - ss << msg << " in " << durationTimer.duration() << " seconds"; + ss << msg << " in " << timer.duration() << " seconds"; log(Log::MSG, ss.str()); } /// If Save take longer than checkpt_save_time_alarm, then set a flag on server /// So that user can be aware of it. - if (static_cast(durationTimer.duration()) > server_->serverEnv_.checkpt_save_time_alarm()) { + if (static_cast(timer.duration()) > server_->serverEnv_.checkpt_save_time_alarm()) { server_->defs_->flag().set(ecf::Flag::LATE); std::stringstream ss; - ss << "Check pt save time(" << durationTimer.duration() << ") is greater than alarm time(" + ss << "Check pt save time(" << timer.duration() << ") is greater than alarm time(" << server_->serverEnv_.checkpt_save_time_alarm() << "). Excessive save times can interfere with scheduling!"; log(Log::WAR, ss.str()); @@ -150,15 +142,15 @@ bool CheckPtSaver::explicitSave(bool from_server) const { #endif } catch (std::exception& e) { - ret = false; std::string msg = "Could not save checkPoint file! "; msg += e.what(); server_->defs_->flag().set(ecf::Flag::CHECKPT_ERROR); server_->defs()->set_server().add_or_update_user_variables("ECF_CHECKPT_ERROR", msg); LOG(Log::ERR, msg); + return false; } - return ret; + return true; } void CheckPtSaver::periodicSaveCheckPt(const boost::system::error_code& error) { diff --git a/Server/src/CheckPtSaver.hpp b/Server/src/CheckPtSaver.hpp index 99eda0f6c..b7171ad32 100644 --- a/Server/src/CheckPtSaver.hpp +++ b/Server/src/CheckPtSaver.hpp @@ -24,6 +24,10 @@ /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 #include +#include + +namespace fs = boost::filesystem; + class ServerEnvironment; class BaseServer; @@ -61,6 +65,43 @@ class CheckPtSaver { /// CheckPt::ALWAYS - will save immediately, may cause performance issues with large Node trees void saveIfAllowed(); + /** + * Stores a set of data in the current file location, ensuring a backup file retains the previous version. + * + * @tparam STORE the type of the function that stores the data + * @param current the file path to the current data, which will become the backup + * @param backup the file path to the previous backup, to be discarded + * @param store the function that actually performs the storage + */ + template + static void storeWithBackup(const fs::path& current, const fs::path& backup, STORE store) { + // 1. Save the Definitions to a temporary CheckPoint file + // + // In case an issue occurs while saving the CheckPoint file (e.g. thrown + // exception, or server crash), the procedure is always interrupted without + // changes to the current+old CheckPoint files in order to facilitate recovery. + // + fs::path temporary = fs::unique_path(current.string() + "%%%%-%%%%-%%%%-%%%%"); + store(temporary.string()); + + // 2. Backup the current CheckPoint files + // + // Important: The current checkpoint is only backed up if it exists + // and has size greater than zero bytes. This approach avoid backing up + // empty files, resulting from a full file system, and overriding relevant + // old content that might be otherwise available. + // + // The backup effectively replaces the old with the current CheckPoint file. + // + if (fs::exists(current) && fs::file_size(current) > 0) { + fs::remove(backup); + fs::rename(current, backup); + } + + // 3. Promote the temporary to current CheckPoint file + fs::rename(temporary, current); + } + private: /// save the node tree in the server to a checkPt file. /// this is controlled by the configuration. If the configuration does not @@ -84,4 +125,5 @@ class CheckPtSaver { mutable unsigned int state_change_no_; // detect state change in defs mutable unsigned int modify_change_no_; // detect state change in defs }; + #endif diff --git a/Server/test/TestCheckPtSaver.cpp b/Server/test/TestCheckPtSaver.cpp new file mode 100644 index 000000000..d2cdb43f8 --- /dev/null +++ b/Server/test/TestCheckPtSaver.cpp @@ -0,0 +1,128 @@ +/* + * Copyright 2023- ECMWF. + * + * This software is licensed under the terms of the Apache Licence version 2.0 + * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. + * In applying this licence, ECMWF does not waive the privileges and immunities + * granted to it by virtue of its status as an intergovernmental organisation + * nor does it submit to any jurisdiction. + */ + +#include +#include + +#include + +#include "CheckPtSaver.hpp" + +// TODO: Move the following trace_*_function to a common test module/library +static void trace_test() { + std::cout << "..." << boost::unit_test::framework::current_test_case().full_name() << "\n"; +} + +/** + * A self cleaning test file, useful to automate test data storage/cleanup, with automatic generation of file names. + */ +class TestFile { +public: + /** + * Create a handle for a 'non-existent' data file + */ + explicit TestFile(const std::string& basename) : absolute_path_(TestFile::unique_absolute_path(basename)) {} + + /** + * Create a handle for a data file, with the given content + */ + TestFile(const std::string& basename, const std::string& content) + : absolute_path_(TestFile::unique_absolute_path(basename)) { + f_store(absolute_path_.string(), content); + } + ~TestFile() { fs::remove(absolute_path_); } + + [[nodiscard]] const fs::path& path() const { return absolute_path_; } + [[nodiscard]] std::string load() const { return f_load(absolute_path_); } + + static fs::path unique_absolute_path(const std::string& basename) { + return fs::absolute(fs::unique_path(basename)); + } + +private: + static void f_store(const fs::path& outfile, const std::string& data) { + std::ofstream ofs(outfile.string()); + ofs << data; + } + + static std::string f_load(const fs::path& infile) { + std::ifstream ifs(infile.string()); + std::string buffer((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); + return buffer; + } + +private: + fs::path absolute_path_; +}; + +void handle_store(const fs::path& temporary) { + std::ofstream ofs(temporary.string()); + ofs << "2"; +}; + +BOOST_AUTO_TEST_SUITE(TestServer) + +BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case0) { + trace_test(); + + // Case 0: neither current and backup exist + + TestFile current("current_%%%%-%%%%-%%%%"); // Important: no actual file is created + TestFile backup("backup_%%%%-%%%%-%%%%"); // Important: no actual file is created + + CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store); + + BOOST_CHECK_EQUAL(current.load(), "2"); + BOOST_CHECK(!fs::exists(backup.path())); +} + +BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case1) { + trace_test(); + + // Case 1: current and backup both exist and are non-empty + + TestFile current("current_%%%%-%%%%-%%%%", "1"); + TestFile backup("backup_%%%%-%%%%-%%%%", "X"); + + CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store); + + BOOST_CHECK_EQUAL(current.load(), "2"); + BOOST_CHECK_EQUAL(backup.load(), "1"); +} + +BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case2) { + trace_test(); + + // Case 2: current and backup both exist, and current is empty + + TestFile current("current_%%%%-%%%%-%%%%", ""); // Important: file is created empty + TestFile backup("backup_%%%%-%%%%-%%%%", "X"); + + CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store); + + BOOST_CHECK_EQUAL(current.load(), "2"); + BOOST_CHECK_EQUAL(backup.load(), "X"); +} + +BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case3) { + trace_test(); + + // Case 3: current exists, but no backup is available + + TestFile current("current_%%%%-%%%%-%%%%", "1"); + TestFile backup("backup_%%%%-%%%%-%%%%"); // Important: no actual file is created + + CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store); + + BOOST_CHECK_EQUAL(current.load(), "2"); + BOOST_CHECK_EQUAL(backup.load(), "1"); +} + +BOOST_AUTO_TEST_SUITE_END()