From 651aa06b476e0b730fad954301249467da6c5442 Mon Sep 17 00:00:00 2001 From: Shuhao Wu Date: Fri, 15 Mar 2024 15:16:49 -0400 Subject: [PATCH] Added a string interner --- .clang-tidy | 4 +- CMakeLists.txt | 3 +- include/cactus_rt/config.h | 2 +- include/cactus_rt/tracing/trace_aggregator.h | 6 + .../cactus_rt/tracing/utils/string_interner.h | 49 +++++++ src/cactus_rt/cyclic_thread.cc | 2 +- .../tracing/utils/string_interner.cc | 19 +++ tests/CMakeLists.txt | 2 + tests/tracing/string_interner_benchmark.cc | 43 ++++++ tests/tracing/string_interner_test.cc | 123 ++++++++++++++++++ 10 files changed, 249 insertions(+), 4 deletions(-) create mode 100644 include/cactus_rt/tracing/utils/string_interner.h create mode 100644 src/cactus_rt/tracing/utils/string_interner.cc create mode 100644 tests/tracing/string_interner_benchmark.cc create mode 100644 tests/tracing/string_interner_test.cc diff --git a/.clang-tidy b/.clang-tidy index 8fa807d..e3953db 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -11,6 +11,7 @@ cert-*, -bugprone-implicit-widening-of-multiplication-result, -bugprone-narrowing-conversions, -clang-diagnostic-unused-parameter, +-misc-include-cleaner, -misc-non-private-member-variables-in-classes, -modernize-pass-by-value, -modernize-use-nodiscard, @@ -19,7 +20,8 @@ cert-*, -readability-function-cognitive-complexity, -readability-identifier-length, -readability-isolate-declaration, --readability-magic-numbers' +-readability-magic-numbers, +-readability-redundant-inline-specifier' # TODO: Re-enable bugprone-exception-escape when no longer throwing # https://github.com/isocpp/CppCoreGuidelines/issues/1589 WarningsAsErrors: '*' diff --git a/CMakeLists.txt b/CMakeLists.txt index cdcf6a9..795f9c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,10 +135,11 @@ target_compile_definitions(cactus_rt PUBLIC QUILL_USE_BOUNDED_QUEUE) if (ENABLE_TRACING) target_sources(cactus_rt PRIVATE - src/cactus_rt/tracing/tracing_enabled.cc src/cactus_rt/tracing/sink.cc src/cactus_rt/tracing/thread_tracer.cc src/cactus_rt/tracing/trace_aggregator.cc + src/cactus_rt/tracing/tracing_enabled.cc + src/cactus_rt/tracing/utils/string_interner.cc ) target_link_libraries(cactus_rt diff --git a/include/cactus_rt/config.h b/include/cactus_rt/config.h index 2739326..075e69e 100644 --- a/include/cactus_rt/config.h +++ b/include/cactus_rt/config.h @@ -105,7 +105,7 @@ struct ThreadTracerConfig { */ struct ThreadConfig { // A vector of CPUs this thread should run on. If empty, no CPU restrictions are set. - std::vector cpu_affinity = {}; + std::vector cpu_affinity; // The size of the stack for this thread. Defaults to 8MB. size_t stack_size = 8 * 1024 * 1024; diff --git a/include/cactus_rt/tracing/trace_aggregator.h b/include/cactus_rt/tracing/trace_aggregator.h index 0dd47be..f173512 100644 --- a/include/cactus_rt/tracing/trace_aggregator.h +++ b/include/cactus_rt/tracing/trace_aggregator.h @@ -17,6 +17,7 @@ #include "sink.h" #include "thread_tracer.h" +#include "utils/string_interner.h" namespace cactus_rt::tracing { @@ -56,6 +57,11 @@ class TraceAggregator { // The list of packets only grow here (although shouldn't grow that much). std::list sticky_trace_packets_; + // These are the interners for the event name and event categories to save + // space on the output. + utils::StringInterner event_name_interner_; + utils::StringInterner event_category_interner_; + // This is a map of trusted_sequence_id to InternedData. // // The InternedData is allocated directly here and kept for the duration of diff --git a/include/cactus_rt/tracing/utils/string_interner.h b/include/cactus_rt/tracing/utils/string_interner.h new file mode 100644 index 0000000..72a90ae --- /dev/null +++ b/include/cactus_rt/tracing/utils/string_interner.h @@ -0,0 +1,49 @@ +#ifndef CACTUS_TRACING_STRING_INTERNER_H_ +#define CACTUS_TRACING_STRING_INTERNER_H_ + +#include +#include +#include +#include +#include + +namespace cactus_rt::tracing::utils { + +/** + * @private + * + * This class interns strings to become ids so the ids can be written to the + * trace file to save space. + * + * It is designed to convert the same string content to uint64_t. A string could + * be a const char*, std::string_view or std::string. If const char* is used, + * the string content is copied and stored in this class the first time to + * ensure it is not deleted during the lifetime of this class. + * + * This class is NOT thread-safe. + * + * TODO: is having a list of strings as references manually necessary? Can we + * not simply have std::string as the map key. When looking up values, does C++ + * convert const char* to string view which has a hash code and thus should find + * the string?... Why is C++ so confusing for simple stuff? It looks like this + * feature only exist in C++20 and above? Heterogenous lookup: + * http://wg21.link/P0919r0. + * + * So for now we are stuck with this. + */ +class StringInterner { + uint64_t current_id_ = 0; + std::unordered_map ids_; + std::list strings_; + + public: + std::pair GetId(const std::string_view& s); + std::pair GetId(const char* s); + + size_t Size() const { + return strings_.size(); + }; +}; +} // namespace cactus_rt::tracing::utils + +#endif diff --git a/src/cactus_rt/cyclic_thread.cc b/src/cactus_rt/cyclic_thread.cc index d560e94..3c0366e 100644 --- a/src/cactus_rt/cyclic_thread.cc +++ b/src/cactus_rt/cyclic_thread.cc @@ -25,7 +25,7 @@ void CyclicThread::Run() noexcept { Tracer().StartSpan("Loop", "cactusrt", loop_start); } - bool should_stop = Loop(loop_start - Thread::StartMonotonicTimeNs()); + const bool should_stop = Loop(loop_start - Thread::StartMonotonicTimeNs()); loop_end = NowNs(); diff --git a/src/cactus_rt/tracing/utils/string_interner.cc b/src/cactus_rt/tracing/utils/string_interner.cc new file mode 100644 index 0000000..7411bf1 --- /dev/null +++ b/src/cactus_rt/tracing/utils/string_interner.cc @@ -0,0 +1,19 @@ +#include "cactus_rt/tracing/utils/string_interner.h" + +namespace cactus_rt::tracing::utils { +std::pair StringInterner::GetId(const std::string_view& s) { + if (auto it = ids_.find(s); it != ids_.end()) { + return std::make_pair(false, it->second); + } + + const auto& copied_str = strings_.emplace_back(s); + const auto id = ++current_id_; + ids_.emplace(copied_str, id); + + return std::make_pair(true, id); +} + +std::pair StringInterner::GetId(const char* s) { + return GetId(std::string_view{s}); +} +} // namespace cactus_rt::tracing::utils diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8cfc157..fc47cc9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -21,6 +21,7 @@ if(ENABLE_TRACING) add_executable(cactus_rt_tracing_tests tracing/single_threaded_test.cc + tracing/string_interner_test.cc tracing/multi_threaded_test.cc tracing/helpers/assert_helpers.cc @@ -45,6 +46,7 @@ find_package(benchmark REQUIRED) add_executable(cactus_rt_tracing_benchmark tracing/tracing_benchmark.cc + tracing/string_interner_benchmark.cc ) target_link_libraries(cactus_rt_tracing_benchmark diff --git a/tests/tracing/string_interner_benchmark.cc b/tests/tracing/string_interner_benchmark.cc new file mode 100644 index 0000000..c7c8e98 --- /dev/null +++ b/tests/tracing/string_interner_benchmark.cc @@ -0,0 +1,43 @@ +#include +#include + +namespace { + +using cactus_rt::tracing::utils::StringInterner; + +StringInterner SetupInterner() { + StringInterner interner; + interner.GetId("hello1"); + interner.GetId("hello2"); + interner.GetId("hello3"); + interner.GetId("hello4"); + interner.GetId("hello5"); + interner.GetId("hello6"); + interner.GetId("hello7"); + interner.GetId("hello8"); + interner.GetId("hello9"); + return interner; +} + +void BM_StringInternerConstCharArr(benchmark::State& state) { + auto interner = SetupInterner(); + + for (auto _ : state) { + interner.GetId("hello1"); + } +} +BENCHMARK(BM_StringInternerConstCharArr); + +void BM_StringInternerStdLongString(benchmark::State& state) { + auto interner = SetupInterner(); + std::string s; + for (int i = 0; i < 500; i++) { + s.append("hello "); + } + + for (auto _ : state) { + interner.GetId(s); + } +} +BENCHMARK(BM_StringInternerStdLongString); +} // namespace diff --git a/tests/tracing/string_interner_test.cc b/tests/tracing/string_interner_test.cc new file mode 100644 index 0000000..1be87fd --- /dev/null +++ b/tests/tracing/string_interner_test.cc @@ -0,0 +1,123 @@ +#include "cactus_rt/tracing/utils/string_interner.h" + +#include + +#include +#include +#include + +using cactus_rt::tracing::utils::StringInterner; + +namespace cactus_rt::tracing::utils { + +TEST(StringInternerTest, BasicAssertions) { + StringInterner interner; + + const char* const_char_arr = "hello"; + char char_arr[sizeof(const_char_arr)]; // NOLINT(modernize-avoid-c-arrays,bugprone-sizeof-expression) + strncpy(char_arr, const_char_arr, sizeof(char_arr)); + + const std::string_view sv{const_char_arr}; + const std::string str{const_char_arr}; + + const auto [new1, id1] = interner.GetId("hello"); + const auto [new2, id2] = interner.GetId(const_char_arr); + const auto [new3, id3] = interner.GetId(char_arr); + const auto [new4, id4] = interner.GetId(sv); + const auto [new5, id5] = interner.GetId(str); + + EXPECT_NE(const_char_arr, char_arr); // Just a sanity check... + + // Everything should have the same id + EXPECT_EQ(id1, id2); + EXPECT_EQ(id1, id3); + EXPECT_EQ(id1, id4); + EXPECT_EQ(id1, id5); + + // Only one of them should be inserting. + EXPECT_TRUE(new1); + EXPECT_FALSE(new2); + EXPECT_FALSE(new3); + EXPECT_FALSE(new4); + EXPECT_FALSE(new5); + + // We expect the size of the strings memoized to be 1. + EXPECT_EQ(interner.Size(), 1); +} + +TEST(StringInternerTest, ManyStrings) { + StringInterner interner; + + const auto [new1, id1] = interner.GetId("hello1"); + const auto [new2, id2] = interner.GetId("hello2"); + const auto [new3, id3] = interner.GetId("hello3"); + const auto [new4, id4] = interner.GetId("hello4"); + const auto [new5, id5] = interner.GetId("hello5"); + const auto [new6, id6] = interner.GetId("hello6"); + const auto [new7, id7] = interner.GetId("hello7"); + const auto [new8, id8] = interner.GetId("hello8"); + const auto [new9, id9] = interner.GetId("hello9"); + + const std::unordered_set s{id1, id2, id3, id4, id5, id6, id7, id8, id9}; + + EXPECT_EQ(s.size(), 9); + EXPECT_EQ(interner.Size(), 9); + + for (const auto id : s) { + EXPECT_NE(id, 0); + } + + EXPECT_TRUE(new1); + EXPECT_TRUE(new2); + EXPECT_TRUE(new3); + EXPECT_TRUE(new4); + EXPECT_TRUE(new5); + EXPECT_TRUE(new6); + EXPECT_TRUE(new7); + EXPECT_TRUE(new8); + EXPECT_TRUE(new9); + + const std::string str{"hello1"}; + const auto [new10, id10] = interner.GetId(str.data()); + EXPECT_EQ(id1, id10); + EXPECT_FALSE(new10); + EXPECT_EQ(interner.Size(), 9); +} + +TEST(StringInternerTest, OutOfScope) { + StringInterner interner; + + uint64_t id1 = 0; + bool new1 = false; + uint64_t id2 = 0; + bool new2 = true; // intentionally wrong answer to ensure the expect is working + + { + std::string s{"hello1"}; + const auto [n, i] = interner.GetId(s.data()); + new1 = n; + id1 = i; + s.replace(1, 1, "g"); + } + + { + char c[8]; // NOLINT(modernize-avoid-c-arrays) + strncpy(c, "hello1", sizeof(c)); + const auto [n, i] = interner.GetId(c); + new2 = n; + id2 = i; + c[1] = 'g'; + } + + const auto [new3, id3] = interner.GetId("hello1"); + EXPECT_EQ(id1, id2); + EXPECT_EQ(id1, id3); + + EXPECT_TRUE(new1); + EXPECT_FALSE(new2); + EXPECT_FALSE(new3); + + EXPECT_EQ(interner.Size(), 1); +} + +} // namespace cactus_rt::tracing::utils