Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

James.yang/lexer trie impl #13

Open
wants to merge 23 commits into
base: james.yang/lexer_refactor
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
b664de4
Add trie implementation and lexer with trie
JamesYang007 Jan 18, 2020
be0b02b
Remove unnecessary lexer_routines and rename lexer_trie to lexer
JamesYang007 Jan 18, 2020
5f43218
Update new lexer unittest
JamesYang007 Jan 18, 2020
10269ca
Add cmake changes to build new unittests (no more lexer_routines_unit…
JamesYang007 Jan 18, 2020
9943127
Fix process when string termination given (changed to flush)
JamesYang007 Jan 18, 2020
a18548d
Fix issue with non-backtrack behavior when at root vs. non-root
JamesYang007 Jan 18, 2020
c718ace
Add more unittests and integration tests
JamesYang007 Jan 18, 2020
2498daf
Rename namespace lexer to lex and change CMake and directory structure
JamesYang007 Jan 18, 2020
68b59ed
Modify configuration to fix at release points for libs
JamesYang007 Jan 18, 2020
b242364
Use conan to manage nlohmann/json on linux
JamesYang007 Jan 18, 2020
e6627fb
Add configuration for nlohmann json on linux and mac
JamesYang007 Jan 18, 2020
4a8a1da
Move conan to libs and reconfigure
JamesYang007 Jan 18, 2020
58e8824
Add cstring header for strerror
JamesYang007 Jan 18, 2020
a89bd2f
Reimplement TrieNode to have uniqueptr of children
JamesYang007 Jan 18, 2020
954f449
Remove io unittests for now
JamesYang007 Jan 18, 2020
12385f8
Update google benchmark
JamesYang007 Jan 18, 2020
782126a
Update google benchmark again
JamesYang007 Jan 18, 2020
1456ef6
Update configure to recursively update submodule
JamesYang007 Jan 18, 2020
27f7d05
Add legacy files for benchmarking
JamesYang007 Jan 19, 2020
87b49ad
Readd new token.hpp
JamesYang007 Jan 19, 2020
73210ce
Modifying configuration and benchmark in cmake
JamesYang007 Jan 19, 2020
4cf2493
Finish benchmarking
JamesYang007 Jan 19, 2020
75a459e
Add cmake command line args passable to configure
JamesYang007 Jan 19, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
node_modules
build/
conan*
!conanfile.txt
graph_info.json
*Find*.cmake
20 changes: 15 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@ project("Docgen" VERSION 1.0.0
# This will perform memcheck
include(CTest)

# enables testing
enable_testing()

# This is to make this library portable to other machines.
# This will be used for install.
include(GNUInstallDirs)

# enables testing
enable_testing()

# Set C++17 standard for project target
set(CMAKE_CXX_STANDARD 17)
# Set this such that dependency installation through conan can be found
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/libs)
message("CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")

# Create DocgenConfigVersion.cmake which contains current project version
# This is supposed to help with (major) version compatibility.
Expand Down Expand Up @@ -54,8 +55,17 @@ set(ETERNAL_DIR ${PROJECT_SOURCE_DIR}/libs/eternal)
# find json library
find_package(nlohmann_json 3.2.0 REQUIRED)

# find google benchmark
find_package(benchmark REQUIRED PATHS ${GBENCH_DIR}/build)

# add libs subdirectory
add_subdirectory(${PROJECT_SOURCE_DIR}/libs ${PROJECT_BINARY_DIR}/libs)

# add src subdirectory
add_subdirectory(${PROJECT_SOURCE_DIR}/src ${PROJECT_BINARY_DIR}/src)

# add benchmark subdirectory
add_subdirectory(${PROJECT_SOURCE_DIR}/benchmark ${PROJECT_BINARY_DIR}/benchmark)

# add test subdirectory
add_subdirectory(${PROJECT_SOURCE_DIR}/test ${PROJECT_BINARY_DIR}/test)
25 changes: 25 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Benchmark executable for the lexer.
# It is built from the two benchmark translation units below plus the object
# files of the LEXER_LIB_OBJECTS target (expected to be defined elsewhere in
# the project), so the lexer sources are not listed here a second time.
add_executable(lexer_benchmark
${CMAKE_CURRENT_SOURCE_DIR}/core/lex/lexer_benchmark.cpp
${CMAKE_CURRENT_SOURCE_DIR}/core/lex/lexer_legacy_benchmark.cpp
# Source dependency
$<TARGET_OBJECTS:LEXER_LIB_OBJECTS>
)
# The benchmark sources are compiled as C++17.
target_compile_features(lexer_benchmark PRIVATE cxx_std_17)
# Header search paths: google benchmark, the project's own src tree, and the
# eternal library. GBENCH_DIR and ETERNAL_DIR are set outside this file.
target_include_directories(lexer_benchmark PRIVATE
${GBENCH_DIR}/include
${PROJECT_SOURCE_DIR}/src
${ETERNAL_DIR}/include
)
# benchmark_main is linked as well as benchmark, so the benchmark sources are
# presumably not expected to define main() themselves.
target_link_libraries(lexer_benchmark PRIVATE
benchmark::benchmark
benchmark::benchmark_main
pthread
nlohmann_json::nlohmann_json
)

# copy data directory into where lexer_benchmark executable ends up
# (runs after every build of the target, so the copied data stays next to the
# binary under a "data" subdirectory)
add_custom_command(
TARGET lexer_benchmark POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_CURRENT_SOURCE_DIR}/core/lex/data/
$<TARGET_FILE_DIR:lexer_benchmark>/data)
113 changes: 113 additions & 0 deletions benchmark/core/lex/data/data_1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#include <core/lex/lexer.hpp>

namespace docgen {
namespace core {
namespace lex {

///////////////////////////////////
// Lexer Implementation
///////////////////////////////////

// Constructs the lexer by populating trie_ with the fixed table of character
// sequences it recognises, each paired with the Symbol it produces.
//
// Notes on the table:
// - Several distinct characters (" ", "\t", "\v", "\r", "\f") map to the same
//   Symbol::WHITESPACE, while "\n" has its own Symbol::NEWLINE.
// - Some entries are prefixes of others ("//" and "///", "/*" and "/*!",
//   "*" and "*/"), so a match on the shorter sequence may still be extended
//   by the following input characters.
Lexer::Lexer()
: trie_({
{"\n", Symbol::NEWLINE},
{" ", Symbol::WHITESPACE},
{"\t", Symbol::WHITESPACE},
{"\v", Symbol::WHITESPACE},
{"\r", Symbol::WHITESPACE},
{"\f", Symbol::WHITESPACE},
{";", Symbol::SEMICOLON},
{"#", Symbol::HASHTAG},
{"*", Symbol::STAR},
{"{", Symbol::OPEN_BRACE},
{"}", Symbol::CLOSE_BRACE},
// comment openers/closers; the longer variants share a prefix with the shorter ones
{"///", Symbol::BEGIN_SLINE_COMMENT},
{"/*!", Symbol::BEGIN_SBLOCK_COMMENT},
{"//", Symbol::BEGIN_NLINE_COMMENT},
{"/*", Symbol::BEGIN_NBLOCK_COMMENT},
{"*/", Symbol::END_BLOCK_COMMENT},
// '@'-prefixed tags
{"@sdesc", Symbol::SDESC},
{"@tparam", Symbol::TPARAM},
{"@param", Symbol::PARAM},
{"@return", Symbol::RETURN}
})
{}

// Feeds a single character to the lexer.
//
// After refreshing the lexer state, exactly one of four things happens:
//   1. the trie has an edge for c from its current node: c is buffered and
//      the trie advances;
//   2. no edge, and the lexer is backtracking: backtrack(c) takes over;
//   3. no edge, not backtracking, trie at its root: c is plain text;
//   4. no edge, not backtracking, trie below its root: the buffered characters
//      are demoted to plain text, the trie returns to the root, and c is
//      processed again from there.
void Lexer::process(char c)
{
    this->update_state();

    auto& edges = trie_.get_children();
    const bool can_advance = (edges.find(c) != edges.end());

    if (can_advance) {
        // case 1: follow the edge
        buf_.push_back(c);
        trie_.transition(c);
    }
    else if (this->is_backtracking()) {
        // case 2: dead end while backtracking
        this->backtrack(c);
    }
    else if (trie_.is_reset()) {
        // case 3: nothing buffered along a trie path, c is ordinary text
        text_.push_back(c);
    }
    else {
        // case 4: the partial match in buf_ leads nowhere; treat it as text
        // and retry c starting at the root
        text_.append(buf_);
        buf_.clear();
        trie_.reset();
        this->process(c);
    }
}

// Handles a dead end reached while the lexer is in its backtracking state.
//
// Steps, in order:
//   - the accumulated plain text is tokenized;
//   - the trie is stepped back once per character currently held in buf_;
//     the node reached is asserted to be accepting and its symbol is pushed
//     onto status_.tokens;
//   - the contents of buf_ followed by c are fed through process() again
//     after the lexer has been reset.
void Lexer::backtrack(char c)
{
    // emit the pending text before the symbol
    this->tokenize_text();

    // rewind the trie: one back transition for every buffered character
    for (auto steps_left = buf_.size(); steps_left != 0; --steps_left) {
        trie_.back_transition();
    }

    // the rewind must end on an accepting node that carries a symbol
    assert(trie_.is_accept());
    auto accepted_symbol = trie_.get_symbol();
    assert(static_cast<bool>(accepted_symbol));
    status_.tokens.emplace(*accepted_symbol);

    // take over the buffered characters and append the one that failed
    std::string pending(std::move(buf_));
    pending.push_back(c);

    // start over from a clean lexer state
    this->reset();

    // replay everything that has not been consumed by the emitted symbol
    for (char replayed : pending) {
        this->process(replayed);
    }
}

// Finishes the current input: whatever is still held in the lexer is turned
// into tokens and the lexer is reset.
//
// When the lexer is backtracking, the work is delegated to backtrack() with
// a 0 character. Otherwise the buffered characters are treated as plain text.
void Lexer::flush()
{
    this->update_state();

    if (!this->is_backtracking()) {
        // non-backtracking: no parent is an accepting node, so buf_ cannot
        // yield a symbol; fold it into text_, tokenize, and reset the rest
        text_.append(buf_);
        this->tokenize_text();
        this->reset();
        return;
    }

    this->backtrack(0);
}

} // namespace lex
} // namespace core
} // namespace docgen
211 changes: 211 additions & 0 deletions benchmark/core/lex/data/data_2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
#pragma once
#include <unordered_map>
#include <string_view>
#include <type_traits>
#include <cassert>
#include <optional>
#include <memory>
#include <exceptions/exceptions.hpp>

namespace docgen {
namespace core {
namespace lex {

// Character trie mapping non-empty strings to symbols of type SymbolType.
//
// Besides the static structure built at construction, a Trie carries a cursor
// (the "current node") that starts at the root and is moved with
// transition(), back_transition() and reset(). All query members
// (is_accept(), get_symbol(), get_children()) refer to the current node.
template <class SymbolType>
struct Trie
{
private:
    struct TrieNode; // forward declaration

public:
    using pair_t = std::pair<std::string_view, SymbolType>;

    // Constructs the trie from a list of pairs of string and symbol.
    // The string must be of type std::string_view and it must not be empty;
    // an empty string makes the constructor throw
    // exceptions::control_flow_error.
    // The symbol must be of type SymbolType.
    Trie(const std::initializer_list<pair_t>&);

    // Delete compiler-generated copy/move ctor/assignment.
    // A Trie can therefore only be built from an initializer list (there is
    // no default constructor either). Copying or moving would be unsafe as
    // written: curr_node_ and the nodes' parent pointers refer to nodes
    // stored inside this very object (root_ is held by value).
    Trie(const Trie&) =delete;
    Trie(Trie&&) =delete;
    Trie& operator=(const Trie&) =delete;
    Trie& operator=(Trie&&) =delete;

    // Moves the cursor to the child reached through c.
    // Throws std::out_of_range if the current node has no such child.
    void transition(char c);

    // Moves the cursor to the parent of the current node.
    // Throws exceptions::control_flow_error when already at the root.
    void back_transition();

    // Returns true if the current node is an accepting state.
    bool is_accept() const;

    // Returns the children of the current node, keyed by character.
    typename TrieNode::children_t& get_children();

    // Returns true if the cursor is at the root.
    bool is_reset() const;

    // Moves the cursor back to the root.
    void reset();

    // Returns the optional symbol stored at the current node.
    const std::optional<SymbolType>& get_symbol() const;

private:

    struct TrieNode
    {
        using children_t = std::unordered_map<char, std::unique_ptr<TrieNode>>;

        // Inserts the string of the given pair below the current node,
        // creating nodes as needed, and stores the symbol at the final node.
        // The string is read starting from the index given as second argument.
        void insert(const std::pair<std::string_view, SymbolType>&, size_t = 0);

        // Returns if current node is an accepting state.
        bool is_accept() const;

        // Returns the optional symbol associated with current node.
        // Symbol will be active if is_accept is true.
        const std::optional<SymbolType>& get_symbol() const;

        // Returns the children of this node, keyed by character.
        children_t& get_children();

        // Returns the parent of this node, or an empty optional for a node
        // without a parent (the root).
        std::optional<std::reference_wrapper<TrieNode>> get_parent();

    private:

        enum class State : bool {
            accept,
            non_accept
        };

        State state_ = State::non_accept;   // indicates accepting node or not
        std::optional<SymbolType> symbol_;  // symbol for accepting node
        children_t children_;               // current node's children
        // Current node's parent; null for a node that has none.
        // Explicitly initialized so that get_parent() is well-defined however
        // the node was constructed, not only under value-initialization.
        TrieNode* parent_ptr_ = nullptr;
    };

    TrieNode root_;                                         // root of Trie
    std::reference_wrapper<TrieNode> curr_node_ = root_;    // current node
};

////////////////////////////////////////////////////////////////
// TrieNode Implementation
////////////////////////////////////////////////////////////////

// Inserts the key of `pair` into the subtree rooted at this node and stores
// the symbol of `pair` at the node where the key ends.
//
// pair: (key, symbol); the key is consumed one character per trie level.
// idx:  position in the key handled by this node; characters before idx are
//       assumed to have been consumed by the ancestors of this node.
//
// Nodes that already exist along the path are reused; inserting a key that is
// already present overwrites the stored symbol.
template <class SymbolType>
inline void
Trie<SymbolType>::TrieNode::insert(const pair_t& pair, size_t idx)
{
    const auto& str = pair.first;

    // Key exhausted: this node is the accepting state for it.
    // The end is detected through size(), not by looking for a '\0':
    // a string_view need not be NUL-terminated and must not be indexed at
    // size(), and a key may legitimately contain a '\0' character.
    if (idx >= str.size()) {
        state_ = State::accept;
        symbol_ = pair.second;
        return;
    }

    // Find the child for the next character, creating it on first use.
    const char next_char = str[idx];
    auto child_it = children_.find(next_char);
    if (child_it == children_.end()) {
        child_it = children_.emplace(next_char, std::make_unique<TrieNode>()).first;
    }

    TrieNode& child = *child_it->second;
    child.parent_ptr_ = this;   // lets the trie walk back towards the root
    child.insert(pair, idx + 1);
}

// Reports whether this node marks the end of an inserted key.
template <class SymbolType>
inline bool
Trie<SymbolType>::TrieNode::is_accept() const
{
    const bool accepting = (this->state_ == State::accept);
    return accepting;
}

// Gives read-only access to the symbol stored at this node.
// The optional is empty unless a key ending at this node was inserted.
template <class SymbolType>
inline const std::optional<SymbolType>&
Trie<SymbolType>::TrieNode::get_symbol() const
{
    return this->symbol_;
}

// Gives mutable access to the map from character to child node.
template <class SymbolType>
inline typename Trie<SymbolType>::TrieNode::children_t&
Trie<SymbolType>::TrieNode::get_children()
{
    return this->children_;
}

// Returns a reference to the parent node wrapped in an optional.
// The optional is empty when this node has no parent pointer set.
template <class SymbolType>
inline std::optional<std::reference_wrapper<typename Trie<SymbolType>::TrieNode>>
Trie<SymbolType>::TrieNode::get_parent()
{
    std::optional<std::reference_wrapper<TrieNode>> parent;
    if (parent_ptr_) {
        parent.emplace(*parent_ptr_);
    }
    return parent;
}

////////////////////////////////////////////////////////////////
// Trie Implementation
////////////////////////////////////////////////////////////////

// Builds the trie by inserting every (string, symbol) pair at the root.
//
// Pairs are inserted in list order. If a pair with an empty string is met,
// exceptions::control_flow_error is thrown at that point; pairs preceding it
// have already been inserted.
template <class SymbolType>
inline
Trie<SymbolType>::Trie(const std::initializer_list<pair_t>& pairs)
    : root_()
{
    for (const auto& entry : pairs) {
        const bool key_is_empty = entry.first.empty();
        if (key_is_empty) {
            throw exceptions::control_flow_error("strings must be non-empty");
        }
        root_.insert(entry);
    }
}

// Advances the cursor to the child of the current node reached through c.
// The lookup uses at(), so a missing child results in std::out_of_range.
template <class SymbolType>
inline void
Trie<SymbolType>::transition(char c)
{
    auto& children = curr_node_.get().get_children();
    TrieNode& next = *children.at(c);
    curr_node_ = next;
}

// Reports whether the node under the cursor is an accepting state.
template <class SymbolType>
inline bool
Trie<SymbolType>::is_accept() const
{
    const TrieNode& current = curr_node_;
    return current.is_accept();
}

// Gives mutable access to the children of the node under the cursor.
template <class SymbolType>
inline typename Trie<SymbolType>::TrieNode::children_t&
Trie<SymbolType>::get_children()
{
    TrieNode& current = curr_node_;
    return current.get_children();
}

// Reports whether the cursor is at the root, decided by comparing the address
// of the current node with the address of root_.
template <class SymbolType>
inline bool
Trie<SymbolType>::is_reset() const
{
    const TrieNode* current = &(curr_node_.get());
    const TrieNode* root = &root_;
    return current == root;
}

// Puts the cursor back on the root node. The trie structure is untouched.
template <class SymbolType>
inline void
Trie<SymbolType>::reset()
{
    TrieNode& root = root_;
    curr_node_ = root;
}

// Moves the cursor one level up, to the parent of the current node.
// Throws exceptions::control_flow_error if the current node has no parent.
template <class SymbolType>
inline void
Trie<SymbolType>::back_transition()
{
    const auto parent = curr_node_.get().get_parent();
    if (parent.has_value()) {
        curr_node_ = *parent;
        return;
    }
    throw exceptions::control_flow_error("Attempt to back transition past the root");
}

// Gives read-only access to the optional symbol of the node under the cursor.
template <class SymbolType>
inline const std::optional<SymbolType>&
Trie<SymbolType>::get_symbol() const
{
    const TrieNode& current = curr_node_;
    return current.get_symbol();
}

} // namespace lex
} // namespace core
} // namespace docgen
Loading