Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.

Replace dictionary proxies with nested dictionaries 15/N #700

Merged
merged 1 commit into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions omniscidb/StringDictionary/StringDictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -970,20 +970,30 @@ bool is_regexp_like(const std::string& str,

std::vector<int32_t> StringDictionary::getRegexpLike(const std::string& pattern,
const char escape,
const size_t generation) const {
CHECK(!base_dict_) << "Not implemented";
int64_t generation) const {
generation = generation >= 0 ? std::min(generation, static_cast<int64_t>(entryCount()))
: static_cast<int64_t>(entryCount());

mapd_lock_guard<mapd_shared_mutex> write_lock(rw_mutex_);
const auto cache_key = std::make_pair(pattern, escape);
const auto cache_key = std::make_tuple(pattern, escape, generation);
const auto it = regex_cache_.find(cache_key);
if (it != regex_cache_.end()) {
return it->second;
}

std::vector<int32_t> result;
if (base_dict_) {
result = base_dict_->getRegexpLike(
pattern, escape, std::min(generation, base_generation_));
if (generation < base_generation_) {
return result;
}
}

std::vector<std::thread> workers;
int worker_count = cpu_threads();
CHECK_GT(worker_count, 0);
std::vector<std::vector<int32_t>> worker_results(worker_count);
CHECK_LE(generation, str_count_);
for (int worker_idx = 0; worker_idx < worker_count; ++worker_idx) {
workers.emplace_back([&worker_results,
&pattern,
Expand All @@ -992,7 +1002,7 @@ std::vector<int32_t> StringDictionary::getRegexpLike(const std::string& pattern,
worker_idx,
worker_count,
this]() {
for (size_t string_id = worker_idx; string_id < generation;
for (int string_id = indexToId(worker_idx); string_id < generation;
string_id += worker_count) {
const auto str = getStringUnlocked(string_id);
if (is_regexp_like(str, pattern, escape)) {
Expand Down
5 changes: 3 additions & 2 deletions omniscidb/StringDictionary/StringDictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ class StringDictionary {

std::vector<int32_t> getRegexpLike(const std::string& pattern,
const char escape,
const size_t generation) const;
int64_t generation = -1) const;

std::vector<std::string> copyStrings(int64_t generation = -1) const;

Expand Down Expand Up @@ -254,7 +254,8 @@ class StringDictionary {
mutable std::map<std::tuple<std::string, bool, bool, char, int64_t>,
std::vector<int32_t>>
like_cache_;
mutable std::map<std::pair<std::string, char>, std::vector<int32_t>> regex_cache_;
mutable std::map<std::tuple<std::string, char, int64_t>, std::vector<int32_t>>
regex_cache_;
mutable std::map<std::string, int32_t> equal_cache_;
mutable DictionaryCache<std::string, compare_cache_value_t> compare_cache_;
mutable std::shared_ptr<std::vector<std::string>> strings_cache_;
Expand Down
30 changes: 30 additions & 0 deletions omniscidb/Tests/StringDictionaryTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -884,6 +884,36 @@ TEST(NestedStringDictionary, GetCompare) {
sortAndCompare(dict2->getCompare("str6", "<>", 1), {0});
}

// Checks getRegexpLike() on a standalone dictionary and on a second dictionary
// constructed on top of it, both with the default generation argument and with
// an explicit generation limit.
TEST(NestedStringDictionary, GetRegexpLike) {
  // Expected ids are wrapped in parentheses at every use so that the commas in
  // the initializer list are not split into separate macro arguments.
  using IdList = std::vector<int>;
  const char escape_char = '\\';

  // Base dictionary: three strings receiving ids 0, 1 and 2.
  auto base_dict =
      std::make_shared<StringDictionary>(DictRef{-1, 1}, -1, g_cache_string_hash);
  ASSERT_EQ(base_dict->getOrAdd("str1"), 0);
  ASSERT_EQ(base_dict->getOrAdd("str2"), 1);
  ASSERT_EQ(base_dict->getOrAdd("str3"), 2);

  // Default generation matches against every entry; generation 2 restricts the
  // result to the first two ids.
  ASSERT_EQ(base_dict->getRegexpLike("str.", escape_char), IdList({0, 1, 2}));
  ASSERT_EQ(base_dict->getRegexpLike("str.", escape_char, 2), IdList({0, 1}));
  ASSERT_EQ(base_dict->getRegexpLike("str[124]", escape_char), IdList({0, 1}));

  // Second dictionary built from the base one. After this point both
  // dictionaries grow independently: id 3 is "str4" in the base dictionary but
  // "str5" in the nested one.
  auto nested_dict =
      std::make_shared<StringDictionary>(base_dict, -1, g_cache_string_hash);
  ASSERT_EQ(base_dict->getOrAdd("str4"), 3);
  ASSERT_EQ(nested_dict->getOrAdd("str5"), 3);
  ASSERT_EQ(nested_dict->getOrAdd("str6"), 4);

  // Base dictionary now also reports its own new entry ("str4", id 3).
  ASSERT_EQ(base_dict->getRegexpLike("str.", escape_char), IdList({0, 1, 2, 3}));
  ASSERT_EQ(base_dict->getRegexpLike("str.", escape_char, 2), IdList({0, 1}));
  ASSERT_EQ(base_dict->getRegexpLike("str[124]", escape_char), IdList({0, 1, 3}));

  // Nested dictionary reports ids 0..2 plus its own entries (ids 3 and 4).
  ASSERT_EQ(nested_dict->getRegexpLike("str.", escape_char),
            IdList({0, 1, 2, 3, 4}));
  ASSERT_EQ(nested_dict->getRegexpLike("str.", escape_char, 2), IdList({0, 1}));
  ASSERT_EQ(nested_dict->getRegexpLike("str[12467]", escape_char),
            IdList({0, 1, 4}));

  // Grow both dictionaries once more; "str6" added to the base dictionary must
  // not show up in the nested result, while the nested "str7" (id 5) must.
  ASSERT_EQ(base_dict->getOrAdd("str6"), 4);
  ASSERT_EQ(nested_dict->getOrAdd("str7"), 5);

  ASSERT_EQ(nested_dict->getRegexpLike("str[12467]", escape_char),
            IdList({0, 1, 4, 5}));
}

TEST(StringDictionaryProxy, BuildIntersectionTranslationMapToOtherProxy) {
// Use existing dictionary from GetBulk
const DictRef dict_ref1(-1, 1);
Expand Down