Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace checkpoint file atomically ECFLOW-1925 #51

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ ecbuild_add_test(
INCLUDES
src
SOURCES
test/TestCheckPtSaver.cpp
test/TestServerEnvironment.cpp
test/TestServer_main.cpp # test entry point
test/TestServer1.cpp
Expand Down
34 changes: 13 additions & 21 deletions Server/src/CheckPtSaver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,28 +99,20 @@ void CheckPtSaver::terminate() {
}

bool CheckPtSaver::explicitSave(bool from_server) const {
bool ret = true;

try {
#ifdef DEBUG_CHECKPT
std::cout << " CheckPtSaver::explicitSave() Saving checkpt file " << serverEnv_->checkPtFilename() << "\n";
#endif
// Time how long we take to checkpt, Help to recognise *SLOW* disk, which can *AFFECT* server performance
DurationTimer durationTimer;
// In order to detect *SLOW* operations (i.e. disk related), which *AFFECT* server performance,
// the following timer measures how long we take to store the CheckPoint file.
// After the Definitions are stored, if a configured warning threshold is passed a warning is issues.
DurationTimer timer;

// Backup checkpoint file if it exists & is non zero
// Avoid an empty file as a backup file, could results from a full file system
// i.e move ecf_checkpt_file --> ecf_backup_checkpt_file
fs::path checkPtFile(serverEnv_->checkPtFilename());
if (fs::exists(checkPtFile) && fs::file_size(checkPtFile) != 0) {

fs::path oldCheckPtFile(serverEnv_->oldCheckPtFilename());
fs::remove(oldCheckPtFile);
fs::rename(checkPtFile, oldCheckPtFile);
}

// write to ecf_checkpt_file, if file system is full this could result in an empty file. ?
server_->defs_->save_as_checkpt(serverEnv_->checkPtFilename());
fs::path oldCheckPtFile(serverEnv_->oldCheckPtFilename());
CheckPtSaver::storeWithBackup(checkPtFile, oldCheckPtFile, [&](const fs::path& file_path) {
server_->defs_->save_as_checkpt(file_path.string());
});

state_change_no_ = Ecf::state_change_no(); // For periodic update only save checkPt if it has changed
modify_change_no_ = Ecf::modify_change_no(); // For periodic update only save checkPt if it has changed
Expand All @@ -131,16 +123,16 @@ bool CheckPtSaver::explicitSave(bool from_server) const {
std::string msg = Str::SVR_CMD();
msg += CtsApi::checkPtDefsArg();
std::stringstream ss;
ss << msg << " in " << durationTimer.duration() << " seconds";
ss << msg << " in " << timer.duration() << " seconds";
log(Log::MSG, ss.str());
}

/// If Save take longer than checkpt_save_time_alarm, then set a flag on server
/// So that user can be aware of it.
if (static_cast<size_t>(durationTimer.duration()) > server_->serverEnv_.checkpt_save_time_alarm()) {
if (static_cast<size_t>(timer.duration()) > server_->serverEnv_.checkpt_save_time_alarm()) {
server_->defs_->flag().set(ecf::Flag::LATE);
std::stringstream ss;
ss << "Check pt save time(" << durationTimer.duration() << ") is greater than alarm time("
ss << "Check pt save time(" << timer.duration() << ") is greater than alarm time("
<< server_->serverEnv_.checkpt_save_time_alarm()
<< "). Excessive save times can interfere with scheduling!";
log(Log::WAR, ss.str());
Expand All @@ -150,15 +142,15 @@ bool CheckPtSaver::explicitSave(bool from_server) const {
#endif
}
catch (std::exception& e) {
ret = false;
std::string msg = "Could not save checkPoint file! ";
msg += e.what();
server_->defs_->flag().set(ecf::Flag::CHECKPT_ERROR);
server_->defs()->set_server().add_or_update_user_variables("ECF_CHECKPT_ERROR", msg);
LOG(Log::ERR, msg);
return false;
}

return ret;
return true;
}

void CheckPtSaver::periodicSaveCheckPt(const boost::system::error_code& error) {
Expand Down
42 changes: 42 additions & 0 deletions Server/src/CheckPtSaver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8

#include <boost/asio.hpp>
#include <boost/filesystem.hpp>

namespace fs = boost::filesystem;

class ServerEnvironment;
class BaseServer;

Expand Down Expand Up @@ -61,6 +65,43 @@ class CheckPtSaver {
/// CheckPt::ALWAYS - will save immediately, may cause performance issues with large Node trees
void saveIfAllowed();

/**
* Stores a set of data in the current file location, ensuring a backup file retains the previous version.
*
* @tparam STORE the type of the function that stores the data
* @param current the file path to the current data, which will become the backup
* @param backup the file path to the previous backup, to be discarded
* @param store the function that actually performs the storage
*/
template <typename STORE>
static void storeWithBackup(const fs::path& current, const fs::path& backup, STORE store) {
// 1. Save the Definitions to a temporary CheckPoint file
//
// In case an issue occurs while saving the CheckPoint file (e.g. thrown
// exception, or server crash), the procedure is always interrupted without
// changes to the current+old CheckPoint files in order to facilitate recovery.
//
fs::path temporary = fs::unique_path(current.string() + "%%%%-%%%%-%%%%-%%%%");
store(temporary.string());

// 2. Backup the current CheckPoint files
//
// Important: The current checkpoint is only backed up if it exists
// and has size greater than zero bytes. This approach avoid backing up
// empty files, resulting from a full file system, and overriding relevant
// old content that might be otherwise available.
//
// The backup effectively replaces the old with the current CheckPoint file.
//
if (fs::exists(current) && fs::file_size(current) > 0) {
fs::remove(backup);
fs::rename(current, backup);
}

// 3. Promote the temporary to current CheckPoint file
fs::rename(temporary, current);
}

private:
/// save the node tree in the server to a checkPt file.
/// this is controlled by the configuration. If the configuration does not
Expand All @@ -84,4 +125,5 @@ class CheckPtSaver {
mutable unsigned int state_change_no_; // detect state change in defs
mutable unsigned int modify_change_no_; // detect state change in defs
};

#endif
128 changes: 128 additions & 0 deletions Server/test/TestCheckPtSaver.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/*
* Copyright 2023- ECMWF.
*
* This software is licensed under the terms of the Apache Licence version 2.0
* which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
* In applying this licence, ECMWF does not waive the privileges and immunities
* granted to it by virtue of its status as an intergovernmental organisation
* nor does it submit to any jurisdiction.
*/

#include <fstream>
#include <iostream>

#include <boost/test/unit_test.hpp>

#include "CheckPtSaver.hpp"

// TODO: Move the following trace_*_function to a common test module/library
static void trace_test() {
std::cout << "..." << boost::unit_test::framework::current_test_case().full_name() << "\n";
}

/**
* A self cleaning test file, useful to automate test data storage/cleanup, with automatic generation of file names.
*/
class TestFile {
public:
/**
* Create a handle for a 'non-existent' data file
*/
explicit TestFile(const std::string& basename) : absolute_path_(TestFile::unique_absolute_path(basename)) {}

/**
* Create a handle for a data file, with the given content
*/
TestFile(const std::string& basename, const std::string& content)
: absolute_path_(TestFile::unique_absolute_path(basename)) {
f_store(absolute_path_.string(), content);
}
~TestFile() { fs::remove(absolute_path_); }

[[nodiscard]] const fs::path& path() const { return absolute_path_; }
[[nodiscard]] std::string load() const { return f_load(absolute_path_); }

static fs::path unique_absolute_path(const std::string& basename) {
return fs::absolute(fs::unique_path(basename));
}

private:
static void f_store(const fs::path& outfile, const std::string& data) {
std::ofstream ofs(outfile.string());
ofs << data;
}

static std::string f_load(const fs::path& infile) {
std::ifstream ifs(infile.string());
std::string buffer((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
return buffer;
}

private:
fs::path absolute_path_;
};

void handle_store(const fs::path& temporary) {
std::ofstream ofs(temporary.string());
ofs << "2";
};

BOOST_AUTO_TEST_SUITE(TestServer)

BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case0) {
trace_test();

// Case 0: neither current and backup exist

TestFile current("current_%%%%-%%%%-%%%%"); // Important: no actual file is created
TestFile backup("backup_%%%%-%%%%-%%%%"); // Important: no actual file is created

CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store);

BOOST_CHECK_EQUAL(current.load(), "2");
BOOST_CHECK(!fs::exists(backup.path()));
}

BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case1) {
trace_test();

// Case 1: current and backup both exist and are non-empty

TestFile current("current_%%%%-%%%%-%%%%", "1");
TestFile backup("backup_%%%%-%%%%-%%%%", "X");

CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store);

BOOST_CHECK_EQUAL(current.load(), "2");
BOOST_CHECK_EQUAL(backup.load(), "1");
}

BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case2) {
trace_test();

// Case 2: current and backup both exist, and current is empty

TestFile current("current_%%%%-%%%%-%%%%", ""); // Important: file is created empty
TestFile backup("backup_%%%%-%%%%-%%%%", "X");

CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store);

BOOST_CHECK_EQUAL(current.load(), "2");
BOOST_CHECK_EQUAL(backup.load(), "X");
}

BOOST_AUTO_TEST_CASE(test_checkpt_store_successful_case3) {
trace_test();

// Case 3: current exists, but no backup is available

TestFile current("current_%%%%-%%%%-%%%%", "1");
TestFile backup("backup_%%%%-%%%%-%%%%"); // Important: no actual file is created

CheckPtSaver::storeWithBackup(current.path(), backup.path(), handle_store);

BOOST_CHECK_EQUAL(current.load(), "2");
BOOST_CHECK_EQUAL(backup.load(), "1");
}

BOOST_AUTO_TEST_SUITE_END()
Loading