From 4fa681cdc9c4cb343e879b588bfed1fbfdb7cee8 Mon Sep 17 00:00:00 2001
From: Ken Matsui <26405363+ken-matsui@users.noreply.github.com>
Date: Mon, 18 Dec 2023 00:14:00 -0800
Subject: [PATCH] Implement lev distance

---
Review notes (this area is ignored by `git am`):
- Context lines in the src/Algos.hpp and src/Rustify.hpp hunks that read as a
  bare `#include` or `template` lost their `<...>` part in this copy of the
  patch; restore them from the target tree before applying.
- The `index` blob ids below predate the review edits to Makefile and
  src/Algos.cc.

 Makefile        |  14 +++-
 src/Algos.cc    | 185 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/Algos.hpp   |  13 ++++
 src/Rustify.hpp |  23 ++++++
 4 files changed, 233 insertions(+), 2 deletions(-)
 create mode 100644 src/Algos.cc

diff --git a/Makefile b/Makefile
index 19307c0a2..8286ac95c 100644
--- a/Makefile
+++ b/Makefile
@@ -29,9 +29,12 @@ $(OUT_DIR):
 $(OUT_DIR)/Cmd:
 	mkdir -p $@
 
-$(PROJ_NAME): $(OUT_DIR)/Cmd/Build.o $(OUT_DIR)/Cmd/Test.o $(OUT_DIR)/Cmd/Run.o $(OUT_DIR)/BuildConfig.o $(OUT_DIR)/Logger.o $(OUT_DIR)/TermColor.o $(OUT_DIR)/main.o
+$(PROJ_NAME): $(OUT_DIR)/Algos.o $(OUT_DIR)/Cmd/Build.o $(OUT_DIR)/Cmd/Test.o $(OUT_DIR)/Cmd/Run.o $(OUT_DIR)/BuildConfig.o $(OUT_DIR)/Logger.o $(OUT_DIR)/TermColor.o $(OUT_DIR)/main.o
 	$(CC) $(CFLAGS) $^ -o $@
 
+$(OUT_DIR)/Algos.o: src/Algos.cc src/Algos.hpp src/Rustify.hpp
+	$(CC) $(CFLAGS) -c $< -o $@
+
 $(OUT_DIR)/TermColor.o: src/TermColor.cc src/TermColor.hpp
 	$(CC) $(CFLAGS) -c $< -o $@
 
@@ -54,8 +57,9 @@ $(OUT_DIR)/main.o: src/main.cc src/Cmd/Build.hpp
 	$(CC) $(CFLAGS) -c $< -o $@
 
 
-test: $(OUT_DIR)/tests $(OUT_DIR)/tests/test_BuildConfig
+test: $(OUT_DIR)/tests $(OUT_DIR)/tests/test_BuildConfig $(OUT_DIR)/tests/test_Algos
 	$(OUT_DIR)/tests/test_BuildConfig
+	$(OUT_DIR)/tests/test_Algos
 
 $(OUT_DIR)/tests:
 	mkdir -p $@
@@ -65,3 +69,9 @@ $(OUT_DIR)/tests/test_BuildConfig: $(OUT_DIR)/tests/test_BuildConfig.o $(OUT_DIR
 
 $(OUT_DIR)/tests/test_BuildConfig.o: src/BuildConfig.cc src/BuildConfig.hpp src/Rustify.hpp src/Algos.hpp src/Logger.hpp src/TermColor.hpp
 	$(CC) $(CFLAGS) -DPOAC_TEST -c $< -o $@
+
+$(OUT_DIR)/tests/test_Algos: $(OUT_DIR)/tests/test_Algos.o $(OUT_DIR)/Logger.o $(OUT_DIR)/TermColor.o
+	$(CC) $(CFLAGS) $^ -o $@
+
+$(OUT_DIR)/tests/test_Algos.o: src/Algos.cc src/Algos.hpp src/Rustify.hpp src/Logger.hpp src/TermColor.hpp
+	$(CC) $(CFLAGS) -DPOAC_TEST -c $< -o $@
diff --git a/src/Algos.cc b/src/Algos.cc
new file mode 100644
index 000000000..4c437d494
--- /dev/null
+++ b/src/Algos.cc
@@ -0,0 +1,185 @@
+#include "Algos.hpp"
+
+#include <algorithm>
+#include <cctype>
+#include <utility>
+
+// ref: https://wandbox.org/permlink/zRjT41alOHdwcf00
+/// Compute the Levenshtein (edit) distance between `a` and `b`.
+///
+/// The distance is counted in `char` units, so a multi-byte UTF-8 character
+/// contributes one edit per differing byte.
+///
+/// \param a the source string.
+///
+/// \param b the target string.
+///
+/// \returns the minimum number of single-`char` insertions, deletions and
+/// substitutions that turn `a` into `b`.
+static usize levDistance(StringRef a, StringRef b) {
+  const usize asize = a.size();
+  const usize bsize = b.size();
+
+  // for all i and j, d[i,j] will hold the Levenshtein distance between the
+  // first i characters of a and the first j characters of b
+  Vec<Vec<usize>> d(asize + 1, Vec<usize>(bsize + 1));
+  d[0][0] = 0;
+
+  // source prefixes can be transformed into empty string by dropping all
+  // characters
+  for (usize i = 1; i <= asize; ++i) {
+    d[i][0] = i;
+  }
+
+  // target prefixes can be reached from empty source prefix by inserting every
+  // character
+  for (usize j = 1; j <= bsize; ++j) {
+    d[0][j] = j;
+  }
+
+  for (usize i = 1; i <= asize; ++i) {
+    for (usize j = 1; j <= bsize; ++j) {
+      const usize subst_cost = a[i - 1] == b[j - 1] ? 0 : 1;
+      d[i][j] = std::min({
+          d[i - 1][j] + 1, // deletion
+          d[i][j - 1] + 1, // insertion
+          d[i - 1][j - 1] + subst_cost // substitution
+      });
+    }
+  }
+
+  return d[asize][bsize];
+}
+
+/// Compare `a` and `b` for equality, ignoring case as defined by
+/// `std::tolower`.
+static auto equalsInsensitive(StringRef a, StringRef b) -> bool {
+  // Passing a negative `char` to std::tolower is undefined behavior, so each
+  // byte is converted to `unsigned char` before the call.
+  return std::equal(
+      a.cbegin(), a.cend(), b.cbegin(), b.cend(),
+      [](const unsigned char lhs, const unsigned char rhs) {
+        return std::tolower(lhs) == std::tolower(rhs);
+      }
+  );
+}
+
+Option<StringRef>
+findSimilarStr(StringRef lhs, std::span<const StringRef> candidates) {
+  // We need to check if `candidates` has the exact case-insensitive string
+  // because the Levenshtein distance match does not care about it.
+  for (StringRef c : candidates) {
+    if (equalsInsensitive(lhs, c)) {
+      return c;
+    }
+  }
+
+  // An empty LHS is similar to nothing. Bail out here because `length - 1`
+  // below would wrap around to the maximum `usize` and accept every candidate.
+  const usize length = lhs.size();
+  if (length == 0) {
+    return None;
+  }
+
+  // Keep going with the Levenshtein distance match.
+  // If the LHS size is less than 3, use the LHS size minus 1 and if not,
+  // use the LHS size divided by 3.
+  const usize max_dist = length < 3 ? length - 1 : length / 3;
+
+  Option<std::pair<StringRef, usize>> similar_str = None;
+  for (const StringRef c : candidates) {
+    const usize cur_dist = levDistance(lhs, c);
+    if (cur_dist <= max_dist) {
+      // The first similar string found || More similar string found
+      if (!similar_str.has_value() || cur_dist < similar_str->second) {
+        similar_str = {c, cur_dist};
+      }
+    }
+  }
+
+  if (similar_str.has_value()) {
+    return similar_str->first;
+  } else {
+    return None;
+  }
+}
+
+#ifdef POAC_TEST
+
+#  include <cassert>
+#  include <limits>
+
+void test_levDistance() {
+  // Test bytelength agnosticity
+  for (char c = 0; c < std::numeric_limits<char>::max(); ++c) {
+    String str = String(1, c);
+    assert(levDistance(str, str) == 0);
+  }
+}
+
+void test_levDistance2() {
+  constexpr StringRef A = "\nMäry häd ä little lämb\n\nLittle lämb\n";
+  constexpr StringRef B = "\nMary häd ä little lämb\n\nLittle lämb\n";
+  constexpr StringRef C = "Mary häd ä little lämb\n\nLittle lämb\n";
+
+  assert(levDistance(A, B) == 2);
+  assert(levDistance(B, A) == 2);
+  assert(levDistance(A, C) == 3);
+  assert(levDistance(C, A) == 3);
+  assert(levDistance(B, C) == 1);
+  assert(levDistance(C, B) == 1);
+
+  assert(levDistance("b", "bc") == 1);
+  assert(levDistance("ab", "abc") == 1);
+  assert(levDistance("aab", "aabc") == 1);
+  assert(levDistance("aaab", "aaabc") == 1);
+
+  assert(levDistance("a", "b") == 1);
+  assert(levDistance("ab", "ac") == 1);
+  assert(levDistance("aab", "aac") == 1);
+  assert(levDistance("aaab", "aaac") == 1);
+}
+
+// ref:
+// https://github.com/llvm/llvm-project/commit/a247ba9d15635d96225ef39c8c150c08f492e70a#diff-fd993637669817b267190e7de029b75af5a0328d43d9b70c2e8dd512512091a2
+
+void test_findSimilarStr() {
+  constexpr Arr<StringRef, 8> CANDIDATES{"if",      "ifdef",   "ifndef",
+                                         "elif",    "else",    "endif",
+                                         "elifdef", "elifndef"};
+
+  assert(findSimilarStr("id", CANDIDATES) == "if"sv);
+  assert(findSimilarStr("ifd", CANDIDATES) == "if"sv);
+  assert(findSimilarStr("ifde", CANDIDATES) == "ifdef"sv);
+  assert(findSimilarStr("elf", CANDIDATES) == "elif"sv);
+  assert(findSimilarStr("elsif", CANDIDATES) == "elif"sv);
+  assert(findSimilarStr("elseif", CANDIDATES) == "elif"sv);
+  assert(findSimilarStr("elfidef", CANDIDATES) == "elifdef"sv);
+  assert(findSimilarStr("elfindef", CANDIDATES) == "elifdef"sv);
+  assert(findSimilarStr("elfinndef", CANDIDATES) == "elifndef"sv);
+  assert(findSimilarStr("els", CANDIDATES) == "else"sv);
+  assert(findSimilarStr("endi", CANDIDATES) == "endif"sv);
+
+  assert(findSimilarStr("i", CANDIDATES) == None);
+  assert(findSimilarStr("special_compiler_directive", CANDIDATES) == None);
+}
+
+void test_findSimilarStr2() {
+  constexpr Arr<StringRef, 2> CANDIDATES{"aaab", "aaabc"};
+  assert(findSimilarStr("aaaa", CANDIDATES) == "aaab"sv);
+  assert(findSimilarStr("1111111111", CANDIDATES) == None);
+  // Regression: an empty query must not match via `usize` wrap-around.
+  assert(findSimilarStr("", CANDIDATES) == None);
+
+  constexpr Arr<StringRef, 1> CANDIDATES2{"AAAA"};
+  assert(findSimilarStr("aaaa", CANDIDATES2) == "AAAA"sv);
+}
+
+int main() {
+  test_levDistance();
+  test_levDistance2();
+  test_findSimilarStr();
+  test_findSimilarStr2();
+}
+
+#endif
diff --git a/src/Algos.hpp b/src/Algos.hpp
index 00e555bc6..7e07ef76c 100644
--- a/src/Algos.hpp
+++ b/src/Algos.hpp
@@ -4,6 +4,7 @@
 
 #include
 #include
+#include <span>
 #include
 
 template
@@ -57,3 +58,15 @@ Vec topoSort(
   }
   return res;
 }
+
+// ref: https://reviews.llvm.org/differential/changeset/?ref=3315514
+/// Find a similar string in `candidates`.
+///
+/// \param lhs the string to find a similar string for.
+///
+/// \param candidates the candidates to search for a similar string.
+///
+/// \returns a similar string if one exists. If no similar string exists,
+/// returns None.
+Option<StringRef>
+findSimilarStr(StringRef lhs, std::span<const StringRef> candidates);
diff --git a/src/Rustify.hpp b/src/Rustify.hpp
index 2d011b066..ed711b015 100644
--- a/src/Rustify.hpp
+++ b/src/Rustify.hpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <optional>
 #include
 #include
 #include
@@ -13,6 +14,8 @@
 #include
 
 namespace fs = std::filesystem;
+using namespace std::literals::string_literals;
+using namespace std::literals::string_view_literals;
 
 // NOLINTBEGIN(readability-identifier-naming)
 using u8 = std::uint8_t;
@@ -52,3 +55,23 @@ using HashSet = std::unordered_set;
 
 template
 using Fn = std::function;
+
+template <typename T>
+using Option = std::optional<T>;
+
+struct NoneT : protected std::monostate {
+  constexpr auto operator==(const usize rhs) const -> bool {
+    return String::npos == rhs;
+  }
+
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  constexpr operator std::nullopt_t() const {
+    return std::nullopt;
+  }
+
+  template <typename T>
+  constexpr operator Option<T>() const { // NOLINT(google-explicit-constructor)
+    return std::nullopt;
+  }
+};
+inline constexpr NoneT None; // NOLINT(readability-identifier-naming)