From 2573b63ff1b137c6aa9c7554a7b105cf816c26ae Mon Sep 17 00:00:00 2001 From: Cesare Ferrari Date: Thu, 10 Oct 2024 19:42:52 +0100 Subject: [PATCH] Update SimpleStringDictionary to improve the performance when adding new strings. The string handles are stored in an insertion sorted array, so lookup is now O(log N). --- containers/choc_Value.h | 88 ++++++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 22 deletions(-) diff --git a/containers/choc_Value.h b/containers/choc_Value.h index bd27f7a..fd981c2 100644 --- a/containers/choc_Value.h +++ b/containers/choc_Value.h @@ -428,25 +428,36 @@ class StringDictionary //============================================================================== /** A simple implementation of StringDictionary. This should have good performance for typical-sized dictionaries. - Adding new strings will require O(n) time where n = dictionary size, but + Adding new strings will require O(log n) time where n = dictionary size, but retrieving the string for a handle is fast with O(1). */ struct SimpleStringDictionary : public StringDictionary { SimpleStringDictionary() = default; - SimpleStringDictionary (const SimpleStringDictionary& other) : strings (other.strings) {} - SimpleStringDictionary (SimpleStringDictionary&& other) : strings (std::move (other.strings)) {} - SimpleStringDictionary& operator= (const SimpleStringDictionary& other) { strings = other.strings; return *this; } - SimpleStringDictionary& operator= (SimpleStringDictionary&& other) { strings = std::move (other.strings); return *this; } + SimpleStringDictionary (const SimpleStringDictionary& other) : strings (other.strings), stringMap (other.stringMap) {} + SimpleStringDictionary (SimpleStringDictionary&& other) : strings (std::move (other.strings)), stringMap (std::move (other.stringMap)) {} + SimpleStringDictionary& operator= (const SimpleStringDictionary& other) { strings = other.strings; stringMap = other.stringMap; return *this; } + SimpleStringDictionary& operator= (SimpleStringDictionary&& other) { strings = std::move (other.strings); stringMap = std::move (other.stringMap); return *this; } Handle getHandleForString (std::string_view) override; std::string_view getStringForHandle (Handle handle) const override; + bool empty() const { return strings.empty(); } void clear(); + size_t getRawDataSize() const { return strings.size(); } + const char* getRawData() const { return strings.data(); } + + void setRawData (const void*, size_t); + +private: + std::pair::const_iterator, bool> findGreaterThanOrEqual (std::string_view) const; + /// The strings are stored in a single chunk, which can be saved and - /// reloaded if necessary. + /// reloaded if necessary. The stringMap is a sorted vector of handles + /// supporting fast lookup of strings in the map std::vector strings; + std::vector stringMap; }; //============================================================================== @@ -2924,11 +2935,10 @@ template void Value::serialise (OutputStream& o) const { o.write (getRawData(), value.type.getValueDataSize()); - if (auto stringDataSize = static_cast (dictionary.strings.size())) + if (auto stringDataSize = static_cast (dictionary.getRawDataSize())) { - CHOC_ASSERT (dictionary.strings.back() == 0); Type::SerialisationHelpers::writeVariableLengthInt (o, stringDataSize); - o.write (dictionary.strings.data(), stringDataSize); + o.write (dictionary.getRawData(), stringDataSize); } } } @@ -2953,9 +2963,10 @@ inline Value Value::deserialise (InputData& input) { auto stringDataSize = Type::SerialisationHelpers::readVariableLengthInt (input); Type::SerialisationHelpers::expect (stringDataSize <= static_cast (input.end - input.start)); - v.dictionary.strings.resize (stringDataSize); - std::memcpy (v.dictionary.strings.data(), input.start, stringDataSize); - Type::SerialisationHelpers::expect (v.dictionary.strings.back() == 0); + v.dictionary.setRawData (input.start, stringDataSize); +// v.dictionary.strings.resize (stringDataSize); +// std::memcpy (v.dictionary.strings.data(), input.start, stringDataSize); +// Type::SerialisationHelpers::expect (v.dictionary.strings.back() == 0); } return v; @@ -3213,20 +3224,20 @@ inline SimpleStringDictionary::Handle SimpleStringDictionary::getHandleForString if (text.empty()) return {}; - for (size_t i = 0; i < strings.size(); ++i) - { - std::string_view sv (strings.data() + i); - - if (text == sv) - return { static_cast (i + 1) }; + auto i = findGreaterThanOrEqual (text); - i += sv.length(); - } + if (i.second) + return { *i.first }; auto newHandle = static_cast (strings.size() + 1); - strings.reserve (strings.size() + text.length() + 1); + + if (strings.size() > 100 && (strings.capacity() < (strings.size() + text.length() + 1))) + strings.reserve (strings.size() + 1000); + strings.insert (strings.end(), text.begin(), text.end()); strings.push_back (0); + + stringMap.insert (i.first, newHandle); return { newHandle }; } @@ -3241,8 +3252,41 @@ inline std::string_view SimpleStringDictionary::getStringForHandle (Handle handl return std::string_view (strings.data() + (handle.handle - 1)); } -inline void SimpleStringDictionary::clear() { strings.clear(); } +inline void SimpleStringDictionary::clear() { strings.clear(); stringMap.clear(); } + +inline void SimpleStringDictionary::setRawData (const void* p, size_t n) +{ + strings.resize (n); + std::memcpy (strings.data(), p, n); + + // Populate string map + for (size_t i = 0; i < strings.size(); ++i) + { + std::string_view sv (strings.data() + i); + auto v = findGreaterThanOrEqual (sv); + stringMap.insert (v.first, static_cast (i)); + i += sv.length(); + } +} + +inline std::pair::const_iterator, bool> SimpleStringDictionary::findGreaterThanOrEqual (std::string_view v) const +{ + bool exactMatch = false; + + auto it = std::lower_bound(stringMap.begin(), stringMap.end(), v, [&] (uint32_t i, std::string_view sv) -> bool + { + auto c = sv.compare (getStringForHandle ( { i })); + + if (c == 0) + exactMatch = true; + + return c > 0; + }); + + return std::pair (it, exactMatch); +} } // namespace choc::value + #endif // CHOC_VALUE_POOL_HEADER_INCLUDED