Skip to content

Commit

Permalink
Implement lev distance
Browse files Browse the repository at this point in the history
  • Loading branch information
ken-matsui committed Dec 18, 2023
1 parent feb04f1 commit 4fa681c
Show file tree
Hide file tree
Showing 4 changed files with 204 additions and 2 deletions.
14 changes: 12 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,12 @@ $(OUT_DIR):
$(OUT_DIR)/Cmd:
mkdir -p $@

$(PROJ_NAME): $(OUT_DIR)/Cmd/Build.o $(OUT_DIR)/Cmd/Test.o $(OUT_DIR)/Cmd/Run.o $(OUT_DIR)/BuildConfig.o $(OUT_DIR)/Logger.o $(OUT_DIR)/TermColor.o $(OUT_DIR)/main.o
$(PROJ_NAME): $(OUT_DIR)/Algos.o $(OUT_DIR)/Cmd/Build.o $(OUT_DIR)/Cmd/Test.o $(OUT_DIR)/Cmd/Run.o $(OUT_DIR)/BuildConfig.o $(OUT_DIR)/Logger.o $(OUT_DIR)/TermColor.o $(OUT_DIR)/main.o
$(CC) $(CFLAGS) $^ -o $@

$(OUT_DIR)/Algos.o: src/Algos.cc src/Algos.hpp
$(CC) $(CFLAGS) -c $< -o $@

$(OUT_DIR)/TermColor.o: src/TermColor.cc src/TermColor.hpp
$(CC) $(CFLAGS) -c $< -o $@

Expand All @@ -54,8 +57,9 @@ $(OUT_DIR)/main.o: src/main.cc src/Cmd/Build.hpp
$(CC) $(CFLAGS) -c $< -o $@


test: $(OUT_DIR)/tests $(OUT_DIR)/tests/test_BuildConfig
test: $(OUT_DIR)/tests $(OUT_DIR)/tests/test_BuildConfig $(OUT_DIR)/tests/test_Algos
$(OUT_DIR)/tests/test_BuildConfig
$(OUT_DIR)/tests/test_Algos

$(OUT_DIR)/tests:
mkdir -p $@
Expand All @@ -65,3 +69,9 @@ $(OUT_DIR)/tests/test_BuildConfig: $(OUT_DIR)/tests/test_BuildConfig.o $(OUT_DIR

$(OUT_DIR)/tests/test_BuildConfig.o: src/BuildConfig.cc src/BuildConfig.hpp src/Rustify.hpp src/Algos.hpp src/Logger.hpp src/TermColor.hpp
$(CC) $(CFLAGS) -DPOAC_TEST -c $< -o $@

$(OUT_DIR)/tests/test_Algos: $(OUT_DIR)/tests/test_Algos.o $(OUT_DIR)/Logger.o $(OUT_DIR)/TermColor.o
$(CC) $(CFLAGS) $^ -o $@

$(OUT_DIR)/tests/test_Algos.o: src/Algos.cc src/Algos.hpp src/Logger.hpp src/TermColor.hpp
$(CC) $(CFLAGS) -DPOAC_TEST -c $< -o $@
156 changes: 156 additions & 0 deletions src/Algos.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#include "Algos.hpp"

// ref: https://wandbox.org/permlink/zRjT41alOHdwcf00
/// Computes the Levenshtein (edit) distance between `a` and `b`, i.e. the
/// minimum number of single-element deletions, insertions, and substitutions
/// needed to turn `a` into `b`.  Elements are compared with `==` one by one.
static usize levDistance(StringRef a, StringRef b) {
  const usize rows = a.size() + 1;
  const usize cols = b.size() + 1;

  // table[i][j] holds the distance between the first i elements of `a` and
  // the first j elements of `b`.
  Vec<Vec<usize>> table(rows, Vec<usize>(cols));

  // Turning a prefix of `a` into the empty string takes one deletion per
  // element.
  for (usize i = 0; i < rows; ++i) {
    table[i][0] = i;
  }

  // Building a prefix of `b` from the empty string takes one insertion per
  // element.
  for (usize j = 0; j < cols; ++j) {
    table[0][j] = j;
  }

  for (usize i = 1; i < rows; ++i) {
    for (usize j = 1; j < cols; ++j) {
      const usize deletion = table[i - 1][j] + 1;
      const usize insertion = table[i][j - 1] + 1;

      // Substituting is free when the two elements already match.
      usize substitution = table[i - 1][j - 1];
      if (a[i - 1] != b[j - 1]) {
        ++substitution;
      }

      table[i][j] = std::min(deletion, std::min(insertion, substitution));
    }
  }

  return table[rows - 1][cols - 1];
}

/// Returns true when `a` and `b` have the same length and every pair of
/// corresponding characters is equal after lower-casing with `std::tolower`.
static auto equalsInsensitive(StringRef a, StringRef b) -> bool {
  return std::equal(
      a.cbegin(), a.cend(), b.cbegin(), b.cend(),
      [](char lhs, char rhs) {
        // std::tolower has undefined behavior when its argument is neither
        // EOF nor representable as unsigned char, so a plain (possibly
        // negative) char must be converted to unsigned char first.
        return std::tolower(static_cast<unsigned char>(lhs))
               == std::tolower(static_cast<unsigned char>(rhs));
      }
  );
}

/// Finds the entry of `candidates` that is most similar to `lhs`.
///
/// A candidate that equals `lhs` case-insensitively is returned right away.
/// Otherwise the candidate with the smallest Levenshtein distance to `lhs` is
/// returned, provided that distance does not exceed a threshold derived from
/// the length of `lhs`.  When several candidates share the smallest distance,
/// the one that appears first in `candidates` wins.
///
/// \returns the chosen candidate, or None if no candidate is close enough.
Option<StringRef>
findSimilarStr(StringRef lhs, std::span<const StringRef> candidates) {
  // We need to check if `candidates` has the exact case-insensitive string
  // because the Levenshtein distance match does not care about it.
  for (const StringRef c : candidates) {
    if (equalsInsensitive(lhs, c)) {
      return c;
    }
  }

  const usize length = lhs.size();
  if (length == 0) {
    // An empty LHS can only sensibly match an empty candidate, which the loop
    // above has already handled.  Bail out here: computing `length - 1` below
    // would wrap around to the largest usize and accept every candidate.
    return None;
  }

  // Keep going with the Levenshtein distance match.
  // If the LHS size is less than 3, use the LHS size minus 1 and if not,
  // use the LHS size divided by 3.
  const usize max_dist = length < 3 ? length - 1 : length / 3;

  Option<std::pair<StringRef, usize>> similar_str = None;
  for (const StringRef c : candidates) {
    const usize cur_dist = levDistance(lhs, c);
    if (cur_dist > max_dist) {
      continue;
    }
    // The first similar string found || More similar string found.  The
    // strict `<` keeps the earliest candidate on ties.
    if (!similar_str.has_value() || cur_dist < similar_str->second) {
      similar_str = {c, cur_dist};
    }
  }

  if (similar_str.has_value()) {
    return similar_str->first;
  }
  return None;
}

#ifdef POAC_TEST

# include <cassert>
# include <limits>

void test_levDistance() {
// Test bytelength agnosticity
for (char c = 0; c < std::numeric_limits<char>::max(); ++c) {
String str = String(1, c);
assert(levDistance(str, str) == 0);
}
}

// Checks levDistance against fixed string pairs, including both argument
// orders for the multi-line samples.
void test_levDistance2() {
// A and B differ only in "Märy" vs "Mary"; C is B without the leading "\n".
constexpr StringRef A = "\nMäry häd ä little lämb\n\nLittle lämb\n";
constexpr StringRef B = "\nMary häd ä little lämb\n\nLittle lämb\n";
constexpr StringRef C = "Mary häd ä little lämb\n\nLittle lämb\n";

// NOTE(review): the expected distance of 2 for the single ä/a difference
// suggests the strings are compared per byte of a multi-byte (presumably
// UTF-8) encoding -- confirm against the source file encoding.
assert(levDistance(A, B) == 2);
assert(levDistance(B, A) == 2);
assert(levDistance(A, C) == 3);
assert(levDistance(C, A) == 3);
assert(levDistance(B, C) == 1);
assert(levDistance(C, B) == 1);

// One extra trailing character: expected distance 1.
assert(levDistance("b", "bc") == 1);
assert(levDistance("ab", "abc") == 1);
assert(levDistance("aab", "aabc") == 1);
assert(levDistance("aaab", "aaabc") == 1);

// Different last character, same length: expected distance 1.
assert(levDistance("a", "b") == 1);
assert(levDistance("ab", "ac") == 1);
assert(levDistance("aab", "aac") == 1);
assert(levDistance("aaab", "aaac") == 1);
}

// ref:
// https://github.com/llvm/llvm-project/commit/a247ba9d15635d96225ef39c8c150c08f492e70a#diff-fd993637669817b267190e7de029b75af5a0328d43d9b70c2e8dd512512091a2

// Checks findSimilarStr against a fixed candidate list: misspelled inputs are
// expected to resolve to a specific candidate, unrelated inputs to None.
void test_findSimilarStr() {
constexpr Arr<StringRef, 8> CANDIDATES{"if", "ifdef", "ifndef",
"elif", "else", "endif",
"elifdef", "elifndef"};

// Inputs expected to be matched to a candidate.
assert(findSimilarStr("id", CANDIDATES) == "if"sv);
assert(findSimilarStr("ifd", CANDIDATES) == "if"sv);
assert(findSimilarStr("ifde", CANDIDATES) == "ifdef"sv);
assert(findSimilarStr("elf", CANDIDATES) == "elif"sv);
assert(findSimilarStr("elsif", CANDIDATES) == "elif"sv);
assert(findSimilarStr("elseif", CANDIDATES) == "elif"sv);
assert(findSimilarStr("elfidef", CANDIDATES) == "elifdef"sv);
assert(findSimilarStr("elfindef", CANDIDATES) == "elifdef"sv);
assert(findSimilarStr("elfinndef", CANDIDATES) == "elifndef"sv);
assert(findSimilarStr("els", CANDIDATES) == "else"sv);
assert(findSimilarStr("endi", CANDIDATES) == "endif"sv);

// Inputs for which no candidate is expected to be returned.
assert(findSimilarStr("i", CANDIDATES) == None);
assert(findSimilarStr("special_compiler_directive", CANDIDATES) == None);
}

// Additional findSimilarStr checks: a near match, an input with no match, and
// a match that differs from the input only in letter case.
void test_findSimilarStr2() {
constexpr Arr<StringRef, 2> CANDIDATES{"aaab", "aaabc"};
// "aaaa" is expected to resolve to "aaab" rather than "aaabc".
assert(findSimilarStr("aaaa", CANDIDATES) == "aaab"sv);
assert(findSimilarStr("1111111111", CANDIDATES) == None);

// The candidate is returned as spelled in the list, not as spelled in the
// input.
constexpr Arr<StringRef, 1> CANDIDATES2{"AAAA"};
assert(findSimilarStr("aaaa", CANDIDATES2) == "AAAA"sv);
}

// Entry point of the test build (only compiled when POAC_TEST is defined):
// runs each test function in this file in turn.
int main() {
test_levDistance();
test_levDistance2();
test_findSimilarStr();
test_findSimilarStr2();
}

#endif
13 changes: 13 additions & 0 deletions src/Algos.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <iostream>
#include <queue>
#include <span>
#include <stdexcept>

template <typename T>
Expand Down Expand Up @@ -57,3 +58,15 @@ Vec<String> topoSort(
}
return res;
}

// ref: https://reviews.llvm.org/differential/changeset/?ref=3315514
/// Find a similar string in `candidates`.
///
/// \param lhs the string for which a similar string is searched in
/// `candidates`.
///
/// \param candidates the strings to search for one similar to `lhs`.
///
/// \returns a similar string if one exists. If no similar string exists,
/// returns None.
Option<StringRef>
findSimilarStr(StringRef lhs, std::span<const StringRef> candidates);
23 changes: 23 additions & 0 deletions src/Rustify.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <filesystem>
#include <functional>
#include <map>
#include <optional>
#include <set>
#include <string>
#include <string_view>
Expand All @@ -13,6 +14,8 @@
#include <vector>

namespace fs = std::filesystem;
using namespace std::literals::string_literals;
using namespace std::literals::string_view_literals;

// NOLINTBEGIN(readability-identifier-naming)
using u8 = std::uint8_t;
Expand Down Expand Up @@ -52,3 +55,23 @@ using HashSet = std::unordered_set<K>;

template <typename T>
using Fn = std::function<T>;

// Short alias for std::optional<T>.
template <typename T>
using Option = std::optional<T>;

// Tag type behind the `None` constant below.  It converts implicitly to
// std::nullopt_t and to an empty Option<T> for any T, and can be compared
// against a usize.
struct NoneT : protected std::monostate {
// True when `rhs` equals String::npos, so `None == pos` can be written for
// a usize position `pos`.
constexpr auto operator==(const usize rhs) const -> bool {
return String::npos == rhs;
}

// Implicit conversion to std::nullopt.
// NOLINTNEXTLINE(google-explicit-constructor)
constexpr operator std::nullopt_t() const {
return std::nullopt;
}

// Implicit conversion to an Option<T> holding no value.
template <typename T>
constexpr operator Option<T>() const { // NOLINT(google-explicit-constructor)
return std::nullopt;
}
};
// The single shared NoneT instance used to spell "no value".
inline constexpr NoneT None; // NOLINT(readability-identifier-naming)

0 comments on commit 4fa681c

Please sign in to comment.