Skip to content

Commit

Permalink
Added a string interner
Browse files Browse the repository at this point in the history
  • Loading branch information
shuhaowu committed Mar 16, 2024
1 parent 28434fe commit 651aa06
Show file tree
Hide file tree
Showing 10 changed files with 249 additions and 4 deletions.
4 changes: 3 additions & 1 deletion .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ cert-*,
-bugprone-implicit-widening-of-multiplication-result,
-bugprone-narrowing-conversions,
-clang-diagnostic-unused-parameter,
-misc-include-cleaner,
-misc-non-private-member-variables-in-classes,
-modernize-pass-by-value,
-modernize-use-nodiscard,
Expand All @@ -19,7 +20,8 @@ cert-*,
-readability-function-cognitive-complexity,
-readability-identifier-length,
-readability-isolate-declaration,
-readability-magic-numbers'
-readability-magic-numbers,
-readability-redundant-inline-specifier'
# TODO: Re-enable bugprone-exception-escape when no longer throwing
# https://github.com/isocpp/CppCoreGuidelines/issues/1589
WarningsAsErrors: '*'
Expand Down
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,11 @@ target_compile_definitions(cactus_rt PUBLIC QUILL_USE_BOUNDED_QUEUE)
if (ENABLE_TRACING)
target_sources(cactus_rt
PRIVATE
src/cactus_rt/tracing/tracing_enabled.cc
src/cactus_rt/tracing/sink.cc
src/cactus_rt/tracing/thread_tracer.cc
src/cactus_rt/tracing/trace_aggregator.cc
src/cactus_rt/tracing/tracing_enabled.cc
src/cactus_rt/tracing/utils/string_interner.cc
)

target_link_libraries(cactus_rt
Expand Down
2 changes: 1 addition & 1 deletion include/cactus_rt/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ struct ThreadTracerConfig {
*/
struct ThreadConfig {
// A vector of CPUs this thread should run on. If empty, no CPU restrictions are set.
std::vector<size_t> cpu_affinity = {};
std::vector<size_t> cpu_affinity;

// The size of the stack for this thread. Defaults to 8MB.
size_t stack_size = 8 * 1024 * 1024;
Expand Down
6 changes: 6 additions & 0 deletions include/cactus_rt/tracing/trace_aggregator.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "sink.h"
#include "thread_tracer.h"
#include "utils/string_interner.h"

namespace cactus_rt::tracing {

Expand Down Expand Up @@ -56,6 +57,11 @@ class TraceAggregator {
// The list of packets only grow here (although shouldn't grow that much).
std::list<Trace> sticky_trace_packets_;

// These are the interners for the event name and event categories to save
// space on the output.
utils::StringInterner event_name_interner_;
utils::StringInterner event_category_interner_;

// This is a map of trusted_sequence_id to InternedData.
//
// The InternedData is allocated directly here and kept for the duration of
Expand Down
49 changes: 49 additions & 0 deletions include/cactus_rt/tracing/utils/string_interner.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#ifndef CACTUS_TRACING_STRING_INTERNER_H_
#define CACTUS_TRACING_STRING_INTERNER_H_

#include <list>
#include <string>
#include <string_view>
#include <unordered_map>
#include <utility>

namespace cactus_rt::tracing::utils {

/**
* @private
*
* This class interns strings to become ids so the ids can be written to the
* trace file to save space.
*
* It is designed to convert the same string content to uint64_t. A string could
* be a const char*, std::string_view or std::string. If const char* is used,
* the string content is copied and stored in this class the first time to
* ensure it is not deleted during the lifetime of this class.
*
* This class is NOT thread-safe.
*
* TODO: is having a list of strings as references manually necessary? Can we
* not simply have std::string as the map key. When looking up values, does C++
* convert const char* to string view which has a hash code and thus should find
* the string?... Why is C++ so confusing for simple stuff? It looks like this
* feature only exist in C++20 and above? Heterogenous lookup:
* http://wg21.link/P0919r0.
*
* So for now we are stuck with this.
*/
class StringInterner {
uint64_t current_id_ = 0;
std::unordered_map<std::string_view, uint64_t> ids_;
std::list<std::string> strings_;

public:
std::pair<bool, uint64_t> GetId(const std::string_view& s);
std::pair<bool, uint64_t> GetId(const char* s);

size_t Size() const {
return strings_.size();
};
};
} // namespace cactus_rt::tracing::utils

#endif
2 changes: 1 addition & 1 deletion src/cactus_rt/cyclic_thread.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ void CyclicThread::Run() noexcept {
Tracer().StartSpan("Loop", "cactusrt", loop_start);
}

bool should_stop = Loop(loop_start - Thread::StartMonotonicTimeNs());
const bool should_stop = Loop(loop_start - Thread::StartMonotonicTimeNs());

loop_end = NowNs();

Expand Down
19 changes: 19 additions & 0 deletions src/cactus_rt/tracing/utils/string_interner.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#include "cactus_rt/tracing/utils/string_interner.h"

namespace cactus_rt::tracing::utils {
std::pair<bool, uint64_t> StringInterner::GetId(const std::string_view& s) {
if (auto it = ids_.find(s); it != ids_.end()) {
return std::make_pair(false, it->second);
}

const auto& copied_str = strings_.emplace_back(s);
const auto id = ++current_id_;
ids_.emplace(copied_str, id);

return std::make_pair(true, id);
}

std::pair<bool, uint64_t> StringInterner::GetId(const char* s) {
return GetId(std::string_view{s});
}
} // namespace cactus_rt::tracing::utils
2 changes: 2 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ if(ENABLE_TRACING)
add_executable(cactus_rt_tracing_tests

tracing/single_threaded_test.cc
tracing/string_interner_test.cc
tracing/multi_threaded_test.cc

tracing/helpers/assert_helpers.cc
Expand All @@ -45,6 +46,7 @@ find_package(benchmark REQUIRED)

add_executable(cactus_rt_tracing_benchmark
tracing/tracing_benchmark.cc
tracing/string_interner_benchmark.cc
)

target_link_libraries(cactus_rt_tracing_benchmark
Expand Down
43 changes: 43 additions & 0 deletions tests/tracing/string_interner_benchmark.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#include <benchmark/benchmark.h>
#include <cactus_rt/tracing/utils/string_interner.h>

namespace {

using cactus_rt::tracing::utils::StringInterner;

StringInterner SetupInterner() {
StringInterner interner;
interner.GetId("hello1");
interner.GetId("hello2");
interner.GetId("hello3");
interner.GetId("hello4");
interner.GetId("hello5");
interner.GetId("hello6");
interner.GetId("hello7");
interner.GetId("hello8");
interner.GetId("hello9");
return interner;
}

void BM_StringInternerConstCharArr(benchmark::State& state) {
auto interner = SetupInterner();

for (auto _ : state) {
interner.GetId("hello1");
}
}
BENCHMARK(BM_StringInternerConstCharArr);

void BM_StringInternerStdLongString(benchmark::State& state) {
auto interner = SetupInterner();
std::string s;
for (int i = 0; i < 500; i++) {
s.append("hello ");
}

for (auto _ : state) {
interner.GetId(s);
}
}
BENCHMARK(BM_StringInternerStdLongString);
} // namespace
123 changes: 123 additions & 0 deletions tests/tracing/string_interner_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#include "cactus_rt/tracing/utils/string_interner.h"

#include <gtest/gtest.h>

#include <cstring>
#include <iostream>
#include <unordered_set>

using cactus_rt::tracing::utils::StringInterner;

namespace cactus_rt::tracing::utils {

TEST(StringInternerTest, BasicAssertions) {
StringInterner interner;

const char* const_char_arr = "hello";
char char_arr[sizeof(const_char_arr)]; // NOLINT(modernize-avoid-c-arrays,bugprone-sizeof-expression)
strncpy(char_arr, const_char_arr, sizeof(char_arr));

const std::string_view sv{const_char_arr};
const std::string str{const_char_arr};

const auto [new1, id1] = interner.GetId("hello");
const auto [new2, id2] = interner.GetId(const_char_arr);
const auto [new3, id3] = interner.GetId(char_arr);
const auto [new4, id4] = interner.GetId(sv);
const auto [new5, id5] = interner.GetId(str);

EXPECT_NE(const_char_arr, char_arr); // Just a sanity check...

// Everything should have the same id
EXPECT_EQ(id1, id2);
EXPECT_EQ(id1, id3);
EXPECT_EQ(id1, id4);
EXPECT_EQ(id1, id5);

// Only one of them should be inserting.
EXPECT_TRUE(new1);
EXPECT_FALSE(new2);
EXPECT_FALSE(new3);
EXPECT_FALSE(new4);
EXPECT_FALSE(new5);

// We expect the size of the strings memoized to be 1.
EXPECT_EQ(interner.Size(), 1);
}

TEST(StringInternerTest, ManyStrings) {
StringInterner interner;

const auto [new1, id1] = interner.GetId("hello1");
const auto [new2, id2] = interner.GetId("hello2");
const auto [new3, id3] = interner.GetId("hello3");
const auto [new4, id4] = interner.GetId("hello4");
const auto [new5, id5] = interner.GetId("hello5");
const auto [new6, id6] = interner.GetId("hello6");
const auto [new7, id7] = interner.GetId("hello7");
const auto [new8, id8] = interner.GetId("hello8");
const auto [new9, id9] = interner.GetId("hello9");

const std::unordered_set<uint64_t> s{id1, id2, id3, id4, id5, id6, id7, id8, id9};

EXPECT_EQ(s.size(), 9);
EXPECT_EQ(interner.Size(), 9);

for (const auto id : s) {
EXPECT_NE(id, 0);
}

EXPECT_TRUE(new1);
EXPECT_TRUE(new2);
EXPECT_TRUE(new3);
EXPECT_TRUE(new4);
EXPECT_TRUE(new5);
EXPECT_TRUE(new6);
EXPECT_TRUE(new7);
EXPECT_TRUE(new8);
EXPECT_TRUE(new9);

const std::string str{"hello1"};
const auto [new10, id10] = interner.GetId(str.data());
EXPECT_EQ(id1, id10);
EXPECT_FALSE(new10);
EXPECT_EQ(interner.Size(), 9);
}

TEST(StringInternerTest, OutOfScope) {
StringInterner interner;

uint64_t id1 = 0;
bool new1 = false;
uint64_t id2 = 0;
bool new2 = true; // intentionally wrong answer to ensure the expect is working

{
std::string s{"hello1"};
const auto [n, i] = interner.GetId(s.data());
new1 = n;
id1 = i;
s.replace(1, 1, "g");
}

{
char c[8]; // NOLINT(modernize-avoid-c-arrays)
strncpy(c, "hello1", sizeof(c));
const auto [n, i] = interner.GetId(c);
new2 = n;
id2 = i;
c[1] = 'g';
}

const auto [new3, id3] = interner.GetId("hello1");
EXPECT_EQ(id1, id2);
EXPECT_EQ(id1, id3);

EXPECT_TRUE(new1);
EXPECT_FALSE(new2);
EXPECT_FALSE(new3);

EXPECT_EQ(interner.Size(), 1);
}

} // namespace cactus_rt::tracing::utils

0 comments on commit 651aa06

Please sign in to comment.