Skip to content

Commit

Permalink
Rewrite the tokenizer as a view.
Browse files Browse the repository at this point in the history
Signed-off-by: Johannes Kalmbach <[email protected]>
  • Loading branch information
joka921 committed Jan 10, 2025
1 parent 8c8a1a1 commit 0369de6
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 59 deletions.
2 changes: 1 addition & 1 deletion src/index/IndexImpl.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
textView.remove_prefix(1);
TokenizeAndNormalizeText normalizedWords(textView, localeManager);
auto normalizedWords = tokenizeAndNormalizeText(textView, localeManager);
for (auto word : normalizedWords) {
WordsFileLine wordLine{word, false, contextId, 1};
co_yield wordLine;
Expand Down
19 changes: 0 additions & 19 deletions src/parser/WordsAndDocsFileParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,6 @@
#include "util/Exception.h"
#include "util/StringUtils.h"

// _____________________________________________________________________________
void TokenizeAndNormalizeText::start() {
  // No token available: an empty `currentValue_` is what `isFinished()`
  // checks for.
  if (current_ == end_) {
    currentValue_.reset();
    return;
  }
  // Cache the normalized form of the token `current_` refers to, so that
  // `get()` can hand out a reference to it.
  currentValue_ = normalizeToken(*current_);
}

// _____________________________________________________________________________
void TokenizeAndNormalizeText::next() {
  // Step to the following token of the split input.
  ++current_;
  // Past the last token: clear the cached value to mark the range as finished.
  if (current_ == end_) {
    currentValue_.reset();
    return;
  }
  // Otherwise replace the cached value with the newly normalized token.
  currentValue_ = normalizeToken(*current_);
}

// _____________________________________________________________________________
WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile,
LocaleManager localeManager)
Expand Down
53 changes: 15 additions & 38 deletions src/parser/WordsAndDocsFileParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "global/Id.h"
#include "index/StringSortComparator.h"
#include "util/Iterators.h"
#include "util/Views.h"

using std::string;
// Represents a line from the wordsfile.tsv, which stores everything given in
Expand Down Expand Up @@ -105,44 +106,20 @@ struct LiteralsTokenizationDelimiter {
// obj = TokenizeAndNormalizeText{text, localeManager}
// for (auto normalizedWord : obj) { code }
// The type of the value returned when iterating is std::string
class TokenizeAndNormalizeText
    : public ad_utility::InputRangeMixin<TokenizeAndNormalizeText> {
 public:
  // The type of the elements obtained when iterating.
  using StorageType = std::string;

  // Split `text` at `LiteralsTokenizationDelimiter`, skipping empty tokens.
  // The `localeManager` is stored by value inside the tokenizer.
  explicit TokenizeAndNormalizeText(std::string_view text,
                                    LocaleManager localeManager)
      : splitter_{absl::StrSplit(text, LiteralsTokenizationDelimiter{},
                                 absl::SkipEmpty{})},
        current_{splitter_.begin()},
        end_{splitter_.end()},
        localeManager_(std::move(localeManager)) {}

  // Default construction and copying are disabled: `current_` and `end_` are
  // iterators obtained from this object's own `splitter_`.
  TokenizeAndNormalizeText() = delete;
  TokenizeAndNormalizeText(const TokenizeAndNormalizeText&) = delete;
  TokenizeAndNormalizeText& operator=(const TokenizeAndNormalizeText&) = delete;

  // The interface used by the `InputRangeMixin` base class.
  void start();
  bool isFinished() const { return !currentValue_.has_value(); }
  const StorageType& get() const { return *currentValue_; }
  void next();

 private:
  using Splitter = decltype(absl::StrSplit(
      std::string_view{}, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{}));

  // Normalize a single token via `LocaleManager::getLowercaseUtf8`.
  std::string normalizeToken(std::string_view token) {
    return localeManager_.getLowercaseUtf8(token);
  }

  // NOTE: `splitter_` has to be declared before the two iterators, because
  // they are initialized from it in the constructor.
  Splitter splitter_;
  Splitter::const_iterator current_;
  Splitter::const_iterator end_;

  // The normalized current token, or `std::nullopt` once the input is
  // exhausted.
  std::optional<StorageType> currentValue_;

  LocaleManager localeManager_;
};
// TODO<flixtastic> Adapt the comment (it is now a function, and you call it a
// little bit differently)
// TODO<flixtastic> Also comment about the lifetime (the `text` and the
// `localeManager` have to be kept alive while the tokenizer is being used, the
// tokenizer only uses references.
inline auto tokenizeAndNormalizeText(std::string_view text,
const LocaleManager& localeManager) {
std::vector<std::string_view> split{
absl::StrSplit(text, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{})};
return ql::views::transform(ad_utility::OwningView{std::move(split)},
[&localeManager](const auto& str) {
return localeManager.getLowercaseUtf8(str);
});
}

// This class is the parent class of WordsFileParser and DocsFileParser and
// it exists to reduce code duplication since the only difference between the
Expand Down
2 changes: 1 addition & 1 deletion test/WordsAndDocsFileParserTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ auto testDocsFileParser = [](std::string docsFilePath,

auto testTokenizeAndNormalizeText = [](std::string testText,
StringVec normalizedTextAsVec) {
TokenizeAndNormalizeText testTokenizer(testText, getLocaleManager());
auto testTokenizer = tokenizeAndNormalizeText(testText, getLocaleManager());
size_t i = 0;
for (auto normalizedWord : testTokenizer) {
ASSERT_TRUE(i < normalizedTextAsVec.size());
Expand Down

0 comments on commit 0369de6

Please sign in to comment.