Skip to content

Commit

Permalink
Replace checkpoint file atomically ECFLOW-1925
Browse files Browse the repository at this point in the history
  • Loading branch information
marcosbento authored Nov 14, 2023
2 parents 3fca261 + 9e44c54 commit e53d71b
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 21 deletions.
1 change: 1 addition & 0 deletions Server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ ecbuild_add_test(
INCLUDES
src
SOURCES
test/TestCheckPtSaver.cpp
test/TestServerEnvironment.cpp
test/TestServer_main.cpp # test entry point
test/TestServer1.cpp
Expand Down
34 changes: 13 additions & 21 deletions Server/src/CheckPtSaver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,28 +99,20 @@ void CheckPtSaver::terminate() {
}

bool CheckPtSaver::explicitSave(bool from_server) const {
bool ret = true;

try {
#ifdef DEBUG_CHECKPT
std::cout << " CheckPtSaver::explicitSave() Saving checkpt file " << serverEnv_->checkPtFilename() << "\n";
#endif
// Time how long we take to checkpt, Help to recognise *SLOW* disk, which can *AFFECT* server performance
DurationTimer durationTimer;
// In order to detect *SLOW* operations (i.e. disk related), which *AFFECT* server performance,
// the following timer measures how long we take to store the CheckPoint file.
// After the Definitions are stored, if a configured warning threshold is passed a warning is issues.
DurationTimer timer;

// Backup checkpoint file if it exists & is non zero
// Avoid an empty file as a backup file, could results from a full file system
// i.e move ecf_checkpt_file --> ecf_backup_checkpt_file
fs::path checkPtFile(serverEnv_->checkPtFilename());
if (fs::exists(checkPtFile) && fs::file_size(checkPtFile) != 0) {

fs::path oldCheckPtFile(serverEnv_->oldCheckPtFilename());
fs::remove(oldCheckPtFile);
fs::rename(checkPtFile, oldCheckPtFile);
}

// write to ecf_checkpt_file, if file system is full this could result in an empty file. ?
server_->defs_->save_as_checkpt(serverEnv_->checkPtFilename());
fs::path oldCheckPtFile(serverEnv_->oldCheckPtFilename());
CheckPtSaver::storeWithBackup(checkPtFile, oldCheckPtFile, [&](const fs::path& file_path) {
server_->defs_->save_as_checkpt(file_path.string());
});

state_change_no_ = Ecf::state_change_no(); // For periodic update only save checkPt if it has changed
modify_change_no_ = Ecf::modify_change_no(); // For periodic update only save checkPt if it has changed
Expand All @@ -131,16 +123,16 @@ bool CheckPtSaver::explicitSave(bool from_server) const {
std::string msg = Str::SVR_CMD();
msg += CtsApi::checkPtDefsArg();
std::stringstream ss;
ss << msg << " in " << durationTimer.duration() << " seconds";
ss << msg << " in " << timer.duration() << " seconds";
log(Log::MSG, ss.str());
}

/// If Save take longer than checkpt_save_time_alarm, then set a flag on server
/// So that user can be aware of it.
if (static_cast<size_t>(durationTimer.duration()) > server_->serverEnv_.checkpt_save_time_alarm()) {
if (static_cast<size_t>(timer.duration()) > server_->serverEnv_.checkpt_save_time_alarm()) {
server_->defs_->flag().set(ecf::Flag::LATE);
std::stringstream ss;
ss << "Check pt save time(" << durationTimer.duration() << ") is greater than alarm time("
ss << "Check pt save time(" << timer.duration() << ") is greater than alarm time("
<< server_->serverEnv_.checkpt_save_time_alarm()
<< "). Excessive save times can interfere with scheduling!";
log(Log::WAR, ss.str());
Expand All @@ -150,15 +142,15 @@ bool CheckPtSaver::explicitSave(bool from_server) const {
#endif
}
catch (std::exception& e) {
ret = false;
std::string msg = "Could not save checkPoint file! ";
msg += e.what();
server_->defs_->flag().set(ecf::Flag::CHECKPT_ERROR);
server_->defs()->set_server().add_or_update_user_variables("ECF_CHECKPT_ERROR", msg);
LOG(Log::ERR, msg);
return false;
}

return ret;
return true;
}

void CheckPtSaver::periodicSaveCheckPt(const boost::system::error_code& error) {
Expand Down
42 changes: 42 additions & 0 deletions Server/src/CheckPtSaver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8

#include <boost/asio.hpp>
#include <boost/filesystem.hpp>

namespace fs = boost::filesystem;

class ServerEnvironment;
class BaseServer;

Expand Down Expand Up @@ -61,6 +65,43 @@ class CheckPtSaver {
/// CheckPt::ALWAYS - will save immediately, may cause performance issues with large Node trees
void saveIfAllowed();

/**
* Stores a set of data in the current file location, ensuring a backup file retains the previous version.
*
* @tparam STORE the type of the function that stores the data
* @param current the file path to the current data, which will become the backup
* @param backup the file path to the previous backup, to be discarded
* @param store the function that actually performs the storage
*/
template <typename STORE>
static void storeWithBackup(const fs::path& current, const fs::path& backup, STORE store) {
// 1. Save the Definitions to a temporary CheckPoint file
//
// In case an issue occurs while saving the CheckPoint file (e.g. thrown
// exception, or server crash), the procedure is always interrupted without
// changes to the current+old CheckPoint files in order to facilitate recovery.
//
fs::path temporary = fs::unique_path(current.string() + "%%%%-%%%%-%%%%-%%%%");
store(temporary.string());

// 2. Backup the current CheckPoint files
//
// Important: The current checkpoint is only backed up if it exists
// and has size greater than zero bytes. This approach avoid backing up
// empty files, resulting from a full file system, and overriding relevant
// old content that might be otherwise available.
//
// The backup effectively replaces the old with the current CheckPoint file.
//
if (fs::exists(current) && fs::file_size(current) > 0) {
fs::remove(backup);
fs::rename(current, backup);
}

// 3. Promote the temporary to current CheckPoint file
fs::rename(temporary, current);
}

private:
/// save the node tree in the server to a checkPt file.
/// this is controlled by the configuration. If the configuration does not
Expand All @@ -84,4 +125,5 @@ class CheckPtSaver {
mutable unsigned int state_change_no_; // detect state change in defs
mutable unsigned int modify_change_no_; // detect state change in defs
};

#endif
128 changes: 128 additions & 0 deletions Server/test/TestCheckPtSaver.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/*
* Copyright 2023- ECMWF.
*
* This software is licensed under the terms of the Apache Licence version 2.0
* which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
* In applying this licence, ECMWF does not waive the privileges and immunities
* granted to it by virtue of its status as an intergovernmental organisation
* nor does it submit to any jurisdiction.
*/

#include <fstream>
#include <iostream>

#include <boost/test/unit_test.hpp>

#include "CheckPtSaver.hpp"

// TODO: Move the following trace_*_function to a common test module/library
static void trace_test() {
std::cout << "..." << boost::unit_test::framework::current_test_case().full_name() << "\n";
}

/**
* A self cleaning test file, useful to automate test data storage/cleanup, with automatic generation of file names.
*/
class TestFile {
public:
/**
* Create a handle for a 'non-existent' data file
*/
explicit TestFile(const std::string& basename) : absolute_path_(TestFile::unique_absolute_path(basename)) {}

/**
* Create a handle for a data file, with the given content
*/
TestFile(const std::string& basename, const std::string& content)
: absolute_path_(TestFile::unique_absolute_path(basename)) {
f_store(absolute_path_.string(), content);
}
~TestFile() { fs::remove(absolute_path_); }

[[nodiscard]] const fs::path& path() const { return absolute_path_; }
[[nodiscard]] std::string load() const { return f_load(absolute_path_); }

static fs::path unique_absolute_path(const std::string& basename) {
return fs::absolute(fs::unique_path(basename));
}

private:
static void f_store(const fs::path& outfile, const std::string& data) {
std::ofstream ofs(outfile.string());
ofs << data;
}

static std::string f_load(const fs::path& infile) {
std::ifstream ifs(infile.string());
std::string buffer((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
return buffer;
}

private:
fs::path absolute_path_;
};

void handle_store(const fs::path& temporary) {
std::ofstream ofs(temporary.string());
ofs << "2";
};

BOOST_AUTO_TEST_SUITE(TestServer)

BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case0) {
trace_test();

// Case 0: neither current and backup exist

TestFile current("current_%%%%-%%%%-%%%%"); // Important: no actual file is created
TestFile backup("backup_%%%%-%%%%-%%%%"); // Important: no actual file is created

CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store);

BOOST_CHECK_EQUAL(current.load(), "2");
BOOST_CHECK(!fs::exists(backup.path()));
}

BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case1) {
trace_test();

// Case 1: current and backup both exist and are non-empty

TestFile current("current_%%%%-%%%%-%%%%", "1");
TestFile backup("backup_%%%%-%%%%-%%%%", "X");

CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store);

BOOST_CHECK_EQUAL(current.load(), "2");
BOOST_CHECK_EQUAL(backup.load(), "1");
}

BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case2) {
trace_test();

// Case 2: current and backup both exist, and current is empty

TestFile current("current_%%%%-%%%%-%%%%", ""); // Important: file is created empty
TestFile backup("backup_%%%%-%%%%-%%%%", "X");

CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store);

BOOST_CHECK_EQUAL(current.load(), "2");
BOOST_CHECK_EQUAL(backup.load(), "X");
}

BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case3) {
trace_test();

// Case 3: current exists, but no backup is available

TestFile current("current_%%%%-%%%%-%%%%", "1");
TestFile backup("backup_%%%%-%%%%-%%%%"); // Important: no actual file is created

CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store);

BOOST_CHECK_EQUAL(current.load(), "2");
BOOST_CHECK_EQUAL(backup.load(), "1");
}

BOOST_AUTO_TEST_SUITE_END()

0 comments on commit e53d71b

Please sign in to comment.