Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[core] Fix gcs logging #48952

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
4 changes: 0 additions & 4 deletions python/ray/_private/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -1154,14 +1154,10 @@ def start_gcs_server(self):
assert gcs_server_port > 0
assert self._gcs_address is None, "GCS server is already running."
assert self._gcs_client is None, "GCS client is already connected."
# TODO(mwtian): append date time so restarted GCS uses different files.
stdout_file, stderr_file = self.get_log_file_handles("gcs_server", unique=True)
process_info = ray._private.services.start_gcs_server(
self.redis_address,
self._logs_dir,
self.session_name,
stdout_file=stdout_file,
stderr_file=stderr_file,
redis_password=self._ray_params.redis_password,
config=self._config,
fate_share=self.kernel_fate_share,
Expand Down
11 changes: 5 additions & 6 deletions python/ray/_private/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -1443,8 +1443,6 @@ def start_gcs_server(
redis_address: str,
log_dir: str,
session_name: str,
stdout_file: Optional[IO[AnyStr]] = None,
stderr_file: Optional[IO[AnyStr]] = None,
redis_password: Optional[str] = None,
config: Optional[dict] = None,
fate_share: Optional[bool] = None,
Expand All @@ -1458,10 +1456,6 @@ def start_gcs_server(
redis_address: The address that the Redis server is listening on.
log_dir: The path of the dir where log files are created.
session_name: The session name (cluster id) of this cluster.
stdout_file: A file handle opened for writing to redirect stdout to. If
no redirection should happen, then this should be None.
stderr_file: A file handle opened for writing to redirect stderr to. If
no redirection should happen, then this should be None.
redis_password: The password of the redis server.
config: Optional configuration that will
override defaults in RayConfig.
Expand Down Expand Up @@ -1494,6 +1488,11 @@ def start_gcs_server(
]
if redis_password:
command += [f"--redis_password={redis_password}"]

# Logging is fully managed by C++ side spdlog, which supports rotation and file
dentiny marked this conversation as resolved.
Show resolved Hide resolved
# count limitation.
stdout_file = open("/dev/null")
dentiny marked this conversation as resolved.
Show resolved Hide resolved
stderr_file = open("/dev/null")
process_info = start_ray_process(
command,
ray_constants.PROCESS_TYPE_GCS_SERVER,
Expand Down
3 changes: 2 additions & 1 deletion python/ray/includes/global_state_accessor.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ cdef extern from * namespace "ray::gcs" nogil:
ray::RayLog::ShutDownRayLog,
"ray_init",
ray::RayLogLevel::WARNING,
"" /* log_dir */);
"" /* log_dir */,
"" /*log_file */);

RedisClientOptions options(host, port, password, use_ssl);

Expand Down
3 changes: 2 additions & 1 deletion src/ray/common/test/ray_syncer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -937,7 +937,8 @@ int main(int argc, char **argv) {
ray::RayLog::ShutDownRayLog,
argv[0],
ray::RayLogLevel::INFO,
/*log_dir=*/"");
/*log_dir=*/"",
/*log_file=*/"");
ray::RayLog::InstallFailureSignalHandler(argv[0]);
ray::RayLog::InstallTerminateHandler();

Expand Down
3 changes: 2 additions & 1 deletion src/ray/core_worker/test/actor_creator_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ int main(int argc, char **argv) {
ray::RayLog::ShutDownRayLog,
argv[0],
ray::RayLogLevel::INFO,
/*log_dir=*/"");
/*log_dir=*/"",
/*log_file=*/"");
ray::RayLog::InstallFailureSignalHandler(argv[0]);
return RUN_ALL_TESTS();
}
3 changes: 2 additions & 1 deletion src/ray/core_worker/test/direct_actor_transport_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,8 @@ int main(int argc, char **argv) {
ray::RayLog::ShutDownRayLog,
argv[0],
ray::RayLogLevel::INFO,
/*log_dir=*/"");
/*log_dir=*/"",
/*log_file=*/"");
ray::RayLog::InstallFailureSignalHandler(argv[0]);
return RUN_ALL_TESTS();
}
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,8 @@ int main(int argc, char **argv) {
ray::RayLog::ShutDownRayLog,
argv[0],
ray::RayLogLevel::INFO,
/*log_dir=*/"");
/*log_dir=*/"",
/*log_file=*/"");
::testing::InitGoogleTest(&argc, argv);
RAY_CHECK(argc == 3);
ray::TEST_REDIS_SERVER_EXEC_PATH = argv[1];
Expand Down
3 changes: 2 additions & 1 deletion src/ray/gcs/gcs_client/test/gcs_client_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1087,7 +1087,8 @@ int main(int argc, char **argv) {
ray::RayLog::ShutDownRayLog,
argv[0],
ray::RayLogLevel::INFO,
/*log_dir=*/"");
/*log_dir=*/"",
/*log_file=*/"");
::testing::InitGoogleTest(&argc, argv);
RAY_CHECK(argc == 3);
ray::TEST_REDIS_SERVER_EXEC_PATH = argv[1];
Expand Down
3 changes: 2 additions & 1 deletion src/ray/gcs/gcs_client/test/global_state_accessor_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,8 @@ int main(int argc, char **argv) {
ray::RayLog::ShutDownRayLog,
argv[0],
ray::RayLogLevel::INFO,
/*log_dir=*/"");
/*log_dir=*/"",
/*log_file=*/"");
::testing::InitGoogleTest(&argc, argv);
RAY_CHECK(argc == 3);
ray::TEST_REDIS_SERVER_EXEC_PATH = argv[1];
Expand Down
32 changes: 29 additions & 3 deletions src/ray/gcs/gcs_server/gcs_server_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstdlib>
#include <iostream>
#include <limits>

#include "gflags/gflags.h"
#include "ray/common/ray_config.h"
Expand All @@ -27,6 +29,7 @@ DEFINE_string(redis_address, "", "The ip address of redis.");
DEFINE_bool(redis_enable_ssl, false, "Use tls/ssl in redis connection.");
DEFINE_int32(redis_port, -1, "The port of redis.");
DEFINE_string(log_dir, "", "The path of the dir where log files are created.");
DEFINE_int64(log_rotation_max_size, 0, "Max size to rotate logs for GCS.");
dentiny marked this conversation as resolved.
Show resolved Hide resolved
DEFINE_int32(gcs_server_port, 0, "The port of gcs server.");
DEFINE_int32(metrics_agent_port, -1, "The port of metrics agent.");
DEFINE_string(config_list, "", "The config list of raylet.");
Expand All @@ -38,17 +41,40 @@ DEFINE_string(session_name,
"session_name: The session name (ClusterID) of the cluster.");
DEFINE_string(ray_commit, "", "The commit hash of Ray.");

namespace {
// GCS server output filename.
constexpr std::string_view kGcsServerLog = "gcs_server.out";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get_log_file_handles can create names like gcs_server.2.out if gcs_server.out and gcs_server.1.out both exists. Do we have such thing in spdlog?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All rotated logs are suffixed with id, similar to what you're described.

} // namespace

int main(int argc, char *argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, true);

// Backward compatibility notes:
// Due to historical reason, GCS server flushes all logging and stdout/stderr to a
dentiny marked this conversation as resolved.
Show resolved Hide resolved
// single file called `gcs_server.out`. To keep backward compatibility at best effort,
// we use the same filename as output, and disable log rotation by default.
const int64_t log_rotation_max_size = FLAGS_log_rotation_size <= 0
? std::numeric_limits<int64_t>::max()
: FLAGS_log_rotation_size;
RAY_CHECK_EQ(setenv(
"RAY_ROTATION_MAX_BYTES", std::to_string(log_rotation_max_size), /*overwrite=*/1));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you explain diff of RAY_ROTATION_MAX_BYTES vs FLAGS_log_rotation_size ? If we already have the former, then we only need to fix existing behavior? I see gcs_server_main.cc already call ray::RayLog::StartRayLog and why does the log rotations in it do not work?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't do anything in this PR, instead only do 2 things:

  1. remove python stdout/stderr redirection
  2. change ray_log_shutdown_raii from /*log_dir=*/"" to /*log_dir=*/FLAGS_log_dir

will the rotations automatically work?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes but the file name will be changed. We want to keep the existing gcs_server.out filename for backward compatibility.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will the rotations automatically work?

To answer your question, passing the log directory works for log rotation.
But one motivation would be backward compatibility, namely keep the gcs_server.out filename.

const std::string log_file =
FLAGS_log_dir.empty() ? kGcsServerLog.data()
: absl::StrFormat("%s/%s", FLAGS_log_dir, kGcsServerLog);
// TODO(hjiang): For the current implementation, we assume all logging are managed by
// spdlog, the caveat is there could be there's writing to stdout/stderr as well. The
// final solution is implement self-customized sink for spdlog, and redirect
// stderr/stdout to the file descritor. Hold until it's confirmed necessary.

InitShutdownRAII ray_log_shutdown_raii(ray::RayLog::StartRayLog,
ray::RayLog::ShutDownRayLog,
argv[0],
ray::RayLogLevel::INFO,
/*log_dir=*/"");
/*log_dir=*/"",
/*log_file=*/log_file);
ray::RayLog::InstallFailureSignalHandler(argv[0]);
ray::RayLog::InstallTerminateHandler();

gflags::ParseCommandLineFlags(&argc, &argv, true);

RAY_LOG(INFO)
.WithField("ray_version", kRayVersion)
.WithField("ray_commit", FLAGS_ray_commit)
Expand Down
3 changes: 2 additions & 1 deletion src/ray/gcs/store_client/test/redis_store_client_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,8 @@ int main(int argc, char **argv) {
ray::RayLog::ShutDownRayLog,
argv[0],
ray::RayLogLevel::INFO,
/*log_dir=*/"");
/*log_dir=*/"",
/*log_file=*/"");
::testing::InitGoogleTest(&argc, argv);
RAY_CHECK(argc == 3);
ray::TEST_REDIS_SERVER_EXEC_PATH = argv[1];
Expand Down
3 changes: 2 additions & 1 deletion src/ray/gcs/test/asio_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ int main(int argc, char **argv) {
ray::RayLog::ShutDownRayLog,
argv[0],
ray::RayLogLevel::INFO,
/*log_dir=*/"");
/*log_dir=*/"",
/*log_file=*/"");
::testing::InitGoogleTest(&argc, argv);
RAY_CHECK(argc == 3);
ray::TEST_REDIS_SERVER_EXEC_PATH = argv[1];
Expand Down
3 changes: 2 additions & 1 deletion src/ray/raylet/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ int main(int argc, char *argv[]) {
ray::RayLog::ShutDownRayLog,
argv[0],
ray::RayLogLevel::INFO,
/*log_dir=*/"");
/*log_dir=*/"",
/*log_file=*/"");
ray::RayLog::InstallFailureSignalHandler(argv[0]);
ray::RayLog::InstallTerminateHandler();

Expand Down
3 changes: 2 additions & 1 deletion src/ray/raylet/worker_pool_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2161,7 +2161,8 @@ int main(int argc, char **argv) {
[]() { ray::RayLog::ShutDownRayLog(); },
argv[0],
ray::RayLogLevel::INFO,
/*log_dir=*/"");
/*log_dir=*/"",
/*log_file=*/"");
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
37 changes: 24 additions & 13 deletions src/ray/util/logging.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ constexpr char kLogFormatJsonPattern[] =
RayLogLevel RayLog::severity_threshold_ = RayLogLevel::INFO;
std::string RayLog::app_name_ = "";
std::string RayLog::component_name_ = "";
std::string RayLog::log_dir_ = "";
bool RayLog::log_format_json_ = false;
std::string RayLog::log_format_pattern_ = kLogFormatTextPattern;

Expand Down Expand Up @@ -312,14 +311,32 @@ void RayLog::InitLogFormat() {
}
}

/*static*/ std::string RayLog::GetLogOutputFilename(const std::string &log_dir,
dentiny marked this conversation as resolved.
Show resolved Hide resolved
const std::string &log_file,
const std::string &app_name) {
if (!log_file.empty()) {
return log_file;
}
if (!log_dir.empty()) {
#ifdef _WIN32
int pid = _getpid();
#else
pid_t pid = getpid();
#endif

return JoinPaths(log_dir, absl::StrFormat("%s_%d.log", app_name, pid));
}
return "";
}

void RayLog::StartRayLog(const std::string &app_name,
RayLogLevel severity_threshold,
const std::string &log_dir) {
const std::string &log_dir,
const std::string &log_file) {
InitSeverityThreshold(severity_threshold);
InitLogFormat();

app_name_ = app_name;
log_dir_ = log_dir;

// All the logging sinks to add.
// One for file/stdout, another for stderr.
Expand All @@ -337,13 +354,9 @@ void RayLog::StartRayLog(const std::string &app_name,
}
}

if (!log_dir_.empty()) {
// Enable log file if log_dir_ is not empty.
#ifdef _WIN32
int pid = _getpid();
#else
pid_t pid = getpid();
#endif
const auto log_output_fname =
GetLogOutputFilename(log_dir, log_file, app_name_without_path);
if (!log_output_fname.empty()) {
// Reset log pattern and level and we assume a log file can be rotated with
// 10 files in max size 512M by default.
if (const char *ray_rotation_max_bytes = std::getenv("RAY_ROTATION_MAX_BYTES");
Expand Down Expand Up @@ -372,9 +385,7 @@ void RayLog::StartRayLog(const std::string &app_name,
spdlog::drop(RayLog::GetLoggerName());
}
auto file_sink = std::make_shared<spdlog::sinks::rotating_file_sink_mt>(
JoinPaths(log_dir_, app_name_without_path + "_" + std::to_string(pid) + ".log"),
log_rotation_max_size_,
log_rotation_file_num_);
log_output_fname, log_rotation_max_size_, log_rotation_file_num_);
file_sink->set_level(level);
sinks[0] = std::move(file_sink);
} else {
Expand Down
17 changes: 12 additions & 5 deletions src/ray/util/logging.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,10 +258,15 @@ class RayLog {
///
/// \parem appName The app name which starts the log.
/// \param severity_threshold Logging threshold for the program.
/// \param logDir Logging output file name. If empty, the log won't output to file.
/// \param log_dir Logging output directory name.
/// \param log_file Logging output file name.
/// Both [log_dir] and [log_file] are used to determine log output filename; if both
/// empty, the log won't output to file, but to stdout. If both set, [log_file] has
/// higher priority than [log_dir].
static void StartRayLog(const std::string &appName,
RayLogLevel severity_threshold = RayLogLevel::INFO,
const std::string &logDir = "");
const std::string &log_dir = "",
const std::string &log_file = "");

/// The shutdown function of ray log which should be used with StartRayLog as a pair.
/// If `StartRayLog` wasn't called before, it will be no-op.
Expand Down Expand Up @@ -305,6 +310,11 @@ class RayLog {
static void AddFatalLogCallbacks(
const std::vector<FatalLogCallback> &expose_log_callbacks);

/// Get log outout filename.
static std::string GetLogOutputFilename(const std::string &log_dir,
const std::string &log_file,
const std::string &app_name);

template <typename T>
RayLog &operator<<(const T &t) {
if (IsEnabled()) {
Expand Down Expand Up @@ -376,9 +386,6 @@ class RayLog {
/// to indicate which component generates the log.
/// This is empty if we log to file.
static std::string component_name_;
/// The directory where the log files are stored.
/// If this is empty, logs are printed to stdout.
static std::string log_dir_;
/// This flag is used to avoid calling UninstallSignalAction in ShutDownRayLog if
/// InstallFailureSignalHandler was not called.
static bool is_failure_signal_handler_installed_;
Expand Down
3 changes: 2 additions & 1 deletion src/ray/util/tests/signal_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ int main(int argc, char **argv) {
ray::RayLog::ShutDownRayLog,
argv[0],
ray::RayLogLevel::INFO,
/*log_dir=*/"");
/*log_dir=*/"",
/*log_file=*/"");
ray::RayLog::InstallFailureSignalHandler(argv[0]);
::testing::InitGoogleTest(&argc, argv);
int failed = RUN_ALL_TESTS();
Expand Down