Skip to content

Commit

Permalink
Replace checkpoint file atomically
Browse files Browse the repository at this point in the history
These changes modify the strategy used to store a checkpoint file.

Previously the storage operation consisted of (a) moving the current
checkpoint file to a backup, and then (b) storing the contents of the
checkpoint file. If the server was terminated during (b), the result
would be a corrupt checkpoint file.
The new strategy consists of (a) storing the contents of the checkpoint
file in a temporary file; (b) moving the current checkpoint to backup;
and, finally, (c) promoting the temporary checkpoint file to the current
by simply renaming the file. In this case, even if the storage operation
is interrupted, the current checkpoint file will never lack integrity.

Re ECFLOW-1925
  • Loading branch information
marcosbento committed Nov 14, 2023
1 parent 3fca261 commit 9e44c54
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 21 deletions.
1 change: 1 addition & 0 deletions Server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ ecbuild_add_test(
INCLUDES
src
SOURCES
test/TestCheckPtSaver.cpp
test/TestServerEnvironment.cpp
test/TestServer_main.cpp # test entry point
test/TestServer1.cpp
Expand Down
34 changes: 13 additions & 21 deletions Server/src/CheckPtSaver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,28 +99,20 @@ void CheckPtSaver::terminate() {
}

bool CheckPtSaver::explicitSave(bool from_server) const {
bool ret = true;

try {
#ifdef DEBUG_CHECKPT
std::cout << " CheckPtSaver::explicitSave() Saving checkpt file " << serverEnv_->checkPtFilename() << "\n";
#endif
// Time how long we take to checkpt, Help to recognise *SLOW* disk, which can *AFFECT* server performance
DurationTimer durationTimer;
// In order to detect *SLOW* operations (i.e. disk related), which *AFFECT* server performance,
// the following timer measures how long we take to store the CheckPoint file.
// After the Definitions are stored, if a configured warning threshold is passed a warning is issues.
DurationTimer timer;

// Backup checkpoint file if it exists & is non zero
// Avoid an empty file as a backup file, could results from a full file system
// i.e move ecf_checkpt_file --> ecf_backup_checkpt_file
fs::path checkPtFile(serverEnv_->checkPtFilename());
if (fs::exists(checkPtFile) && fs::file_size(checkPtFile) != 0) {

fs::path oldCheckPtFile(serverEnv_->oldCheckPtFilename());
fs::remove(oldCheckPtFile);
fs::rename(checkPtFile, oldCheckPtFile);
}

// write to ecf_checkpt_file, if file system is full this could result in an empty file. ?
server_->defs_->save_as_checkpt(serverEnv_->checkPtFilename());
fs::path oldCheckPtFile(serverEnv_->oldCheckPtFilename());
CheckPtSaver::storeWithBackup(checkPtFile, oldCheckPtFile, [&](const fs::path& file_path) {
server_->defs_->save_as_checkpt(file_path.string());
});

state_change_no_ = Ecf::state_change_no(); // For periodic update only save checkPt if it has changed
modify_change_no_ = Ecf::modify_change_no(); // For periodic update only save checkPt if it has changed
Expand All @@ -131,16 +123,16 @@ bool CheckPtSaver::explicitSave(bool from_server) const {
std::string msg = Str::SVR_CMD();
msg += CtsApi::checkPtDefsArg();
std::stringstream ss;
ss << msg << " in " << durationTimer.duration() << " seconds";
ss << msg << " in " << timer.duration() << " seconds";
log(Log::MSG, ss.str());
}

/// If Save take longer than checkpt_save_time_alarm, then set a flag on server
/// So that user can be aware of it.
if (static_cast<size_t>(durationTimer.duration()) > server_->serverEnv_.checkpt_save_time_alarm()) {
if (static_cast<size_t>(timer.duration()) > server_->serverEnv_.checkpt_save_time_alarm()) {
server_->defs_->flag().set(ecf::Flag::LATE);
std::stringstream ss;
ss << "Check pt save time(" << durationTimer.duration() << ") is greater than alarm time("
ss << "Check pt save time(" << timer.duration() << ") is greater than alarm time("
<< server_->serverEnv_.checkpt_save_time_alarm()
<< "). Excessive save times can interfere with scheduling!";
log(Log::WAR, ss.str());
Expand All @@ -150,15 +142,15 @@ bool CheckPtSaver::explicitSave(bool from_server) const {
#endif
}
catch (std::exception& e) {
ret = false;
std::string msg = "Could not save checkPoint file! ";
msg += e.what();
server_->defs_->flag().set(ecf::Flag::CHECKPT_ERROR);
server_->defs()->set_server().add_or_update_user_variables("ECF_CHECKPT_ERROR", msg);
LOG(Log::ERR, msg);
return false;
}

return ret;
return true;
}

void CheckPtSaver::periodicSaveCheckPt(const boost::system::error_code& error) {
Expand Down
42 changes: 42 additions & 0 deletions Server/src/CheckPtSaver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8

#include <boost/asio.hpp>
#include <boost/filesystem.hpp>

namespace fs = boost::filesystem;

class ServerEnvironment;
class BaseServer;

Expand Down Expand Up @@ -61,6 +65,43 @@ class CheckPtSaver {
/// CheckPt::ALWAYS - will save immediately, may cause performance issues with large Node trees
void saveIfAllowed();

/**
* Stores a set of data in the current file location, ensuring a backup file retains the previous version.
*
* @tparam STORE the type of the function that stores the data
* @param current the file path to the current data, which will become the backup
* @param backup the file path to the previous backup, to be discarded
* @param store the function that actually performs the storage
*/
template <typename STORE>
static void storeWithBackup(const fs::path& current, const fs::path& backup, STORE store) {
// 1. Save the Definitions to a temporary CheckPoint file
//
// In case an issue occurs while saving the CheckPoint file (e.g. thrown
// exception, or server crash), the procedure is always interrupted without
// changes to the current+old CheckPoint files in order to facilitate recovery.
//
fs::path temporary = fs::unique_path(current.string() + "%%%%-%%%%-%%%%-%%%%");
store(temporary.string());

// 2. Backup the current CheckPoint files
//
// Important: The current checkpoint is only backed up if it exists
// and has size greater than zero bytes. This approach avoid backing up
// empty files, resulting from a full file system, and overriding relevant
// old content that might be otherwise available.
//
// The backup effectively replaces the old with the current CheckPoint file.
//
if (fs::exists(current) && fs::file_size(current) > 0) {
fs::remove(backup);
fs::rename(current, backup);
}

// 3. Promote the temporary to current CheckPoint file
fs::rename(temporary, current);
}

private:
/// save the node tree in the server to a checkPt file.
/// this is controlled by the configuration. If the configuration does not
Expand All @@ -84,4 +125,5 @@ class CheckPtSaver {
mutable unsigned int state_change_no_; // detect state change in defs
mutable unsigned int modify_change_no_; // detect state change in defs
};

#endif
128 changes: 128 additions & 0 deletions Server/test/TestCheckPtSaver.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/*
* Copyright 2023- ECMWF.
*
* This software is licensed under the terms of the Apache Licence version 2.0
* which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
* In applying this licence, ECMWF does not waive the privileges and immunities
* granted to it by virtue of its status as an intergovernmental organisation
* nor does it submit to any jurisdiction.
*/

#include <fstream>
#include <iostream>

#include <boost/test/unit_test.hpp>

#include "CheckPtSaver.hpp"

// TODO: Move the following trace_*_function to a common test module/library
static void trace_test() {
std::cout << "..." << boost::unit_test::framework::current_test_case().full_name() << "\n";
}

/**
* A self cleaning test file, useful to automate test data storage/cleanup, with automatic generation of file names.
*/
class TestFile {
public:
/**
* Create a handle for a 'non-existent' data file
*/
explicit TestFile(const std::string& basename) : absolute_path_(TestFile::unique_absolute_path(basename)) {}

/**
* Create a handle for a data file, with the given content
*/
TestFile(const std::string& basename, const std::string& content)
: absolute_path_(TestFile::unique_absolute_path(basename)) {
f_store(absolute_path_.string(), content);
}
~TestFile() { fs::remove(absolute_path_); }

[[nodiscard]] const fs::path& path() const { return absolute_path_; }
[[nodiscard]] std::string load() const { return f_load(absolute_path_); }

static fs::path unique_absolute_path(const std::string& basename) {
return fs::absolute(fs::unique_path(basename));
}

private:
static void f_store(const fs::path& outfile, const std::string& data) {
std::ofstream ofs(outfile.string());
ofs << data;
}

static std::string f_load(const fs::path& infile) {
std::ifstream ifs(infile.string());
std::string buffer((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
return buffer;
}

private:
fs::path absolute_path_;
};

void handle_store(const fs::path& temporary) {
std::ofstream ofs(temporary.string());
ofs << "2";
};

BOOST_AUTO_TEST_SUITE(TestServer)

BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case0) {
trace_test();

// Case 0: neither current and backup exist

TestFile current("current_%%%%-%%%%-%%%%"); // Important: no actual file is created
TestFile backup("backup_%%%%-%%%%-%%%%"); // Important: no actual file is created

CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store);

BOOST_CHECK_EQUAL(current.load(), "2");
BOOST_CHECK(!fs::exists(backup.path()));
}

BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case1) {
trace_test();

// Case 1: current and backup both exist and are non-empty

TestFile current("current_%%%%-%%%%-%%%%", "1");
TestFile backup("backup_%%%%-%%%%-%%%%", "X");

CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store);

BOOST_CHECK_EQUAL(current.load(), "2");
BOOST_CHECK_EQUAL(backup.load(), "1");
}

BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case2) {
trace_test();

// Case 2: current and backup both exist, and current is empty

TestFile current("current_%%%%-%%%%-%%%%", ""); // Important: file is created empty
TestFile backup("backup_%%%%-%%%%-%%%%", "X");

CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store);

BOOST_CHECK_EQUAL(current.load(), "2");
BOOST_CHECK_EQUAL(backup.load(), "X");
}

BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case3) {
trace_test();

// Case 3: current exists, but no backup is available

TestFile current("current_%%%%-%%%%-%%%%", "1");
TestFile backup("backup_%%%%-%%%%-%%%%"); // Important: no actual file is created

CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store);

BOOST_CHECK_EQUAL(current.load(), "2");
BOOST_CHECK_EQUAL(backup.load(), "1");
}

BOOST_AUTO_TEST_SUITE_END()

0 comments on commit 9e44c54

Please sign in to comment.