From 6d23cdd0f9125b7ff75cb02c159f0482e51d5a6c Mon Sep 17 00:00:00 2001 From: Liam Keegan Date: Thu, 2 Dec 2021 15:13:16 +0100 Subject: [PATCH] add distance function - resolves #44 --- README.md | 12 ++++++++++- include/hamming/hamming.hh | 2 ++ python/hammingdist.cc | 3 +++ python/tests/test_hammingdist.py | 14 +++++++++++++ src/hamming.cc | 22 +++++++++++++++++++++ src/hamming_t.cc | 34 ++++++++++++++++++++++---------- 6 files changed, 76 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 268aee8..037855d 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ To use the Python interface, you should install it from PyPI: python -m pip install hammingdist ``` +## Distances matrix + Then, you can e.g. use it in the following way from Python: ```python @@ -63,9 +65,17 @@ distances = hammingdist.fasta_reference_distances(sequence, fasta_file, include_ This function returns a numpy array that contains the distance of each sequence from the reference sequence. +You can also calculate the distance between two individual sequences: + +```python +import hammingdist + +distance = hammingdist.distance("ACGTX", "AAGTX", include_x=True) +``` + ## OpenMP on linux -The latest version of hammingdist on linux is now built with OpenMP (multithreading) support. +The latest versions of hammingdist on linux are now built with OpenMP (multithreading) support. If this causes any issues, you can install a previous version of hammingdist without OpenMP support: ```bash diff --git a/include/hamming/hamming.hh b/include/hamming/hamming.hh index 5b47979..6a4d592 100644 --- a/include/hamming/hamming.hh +++ b/include/hamming/hamming.hh @@ -13,6 +13,8 @@ DataSet from_fasta(const std::string &, bool include_x = false, bool remove_duplicates = false, std::size_t n = 0); DataSet from_lower_triangular(const std::string &); +ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1, + bool include_x = false); std::vector fasta_reference_distances(const std::string &reference_sequence, const std::string &fasta_file, diff --git a/python/hammingdist.cc b/python/hammingdist.cc index 6d58ac3..2345a26 100644 --- a/python/hammingdist.cc +++ b/python/hammingdist.cc @@ -51,6 +51,9 @@ PYBIND11_MODULE(hammingdist, m) { m.def("from_lower_triangular", &from_lower_triangular, "Creates a dataset by reading already computed distances from lower " "triangular format"); + m.def("distance", &distance, py::arg("seq0"), py::arg("seq1"), + py::arg("include_x") = false, + "Calculate the distance between seq0 and seq1"); m.def( "fasta_reference_distances", [](const std::string &reference_sequence, const std::string &fasta_file, diff --git a/python/tests/test_hammingdist.py b/python/tests/test_hammingdist.py index 709a006..f8784fb 100644 --- a/python/tests/test_hammingdist.py +++ b/python/tests/test_hammingdist.py @@ -101,3 +101,17 @@ def test_fasta_reference_distances(chars, include_x, tmp_path): # if x is not included, invalid chars have distance 1 but data[i,i] returns 0 by construction if include_x or i != j: assert data[i, j] == dist + # should also agree with output of distance function for these two sequences + assert dist == hammingdist.distance( + sequences[i], sequences[j], include_x=include_x + ) + + +def test_distance(): + assert hammingdist.distance("ACGT", "ACCT") == 1 + # here X is invalid so has distance 1 from itself: + assert hammingdist.distance("ACGTX", "ACCTX") == 2 + # now X is valid so has distance 0 from itself: + assert hammingdist.distance("ACGTX", "ACCTX", include_x=True) == 1 + with pytest.raises(RuntimeError): + hammingdist.distance("ACGT", "ACC") diff --git a/src/hamming.cc b/src/hamming.cc index a85fb7c..0b837ce 100644 --- a/src/hamming.cc +++ b/src/hamming.cc @@ -204,4 +204,26 @@ fasta_reference_distances(const std::string &reference_sequence, return distances; } +ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1, + bool include_x) { + auto lookup{lookupTable(include_x)}; + ReferenceDistIntType distance{0}; + if (seq0.size() != seq1.size()) { + throw std::runtime_error( + "Error: Sequences do not all have the same length"); + } + for (std::size_t i = 0; i < seq0.size(); ++i) { + auto a{lookup[seq0[i]]}; + auto b{lookup[seq1[i]]}; + bool invalid{(a & b) == 0x00}; + bool differ{(seq0[i] != seq1[i])}; + if (invalid || differ) { + bool nodash{(a != 0xff) && (b != 0xff)}; + distance += + static_cast(invalid || (differ && nodash)); + } + } + return distance; +} + } // namespace hamming diff --git a/src/hamming_t.cc b/src/hamming_t.cc index 87ff2e7..5246a91 100644 --- a/src/hamming_t.cc +++ b/src/hamming_t.cc @@ -2,6 +2,7 @@ #include "tests.hh" #include #include +#include using namespace hamming; @@ -9,23 +10,30 @@ static constexpr std::array valid_chars{'A', 'C', 'G', 'T'}; static constexpr std::array invalid_chars{' ', 'N', '*', '?', 'a', '.'}; -static int dist(char c1, char c2) { +static int dist1(char c1, char c2) { std::vector v{std::string{c1}, std::string{c2}}; return from_stringlist(v)[{0, 1}]; } +static int dist2(char c1, char c2) { + return distance(std::string{c1}, std::string{c2}); +} + TEST_CASE("distance between two equal valid characters is 0", "[distance]") { for (auto c : valid_chars) { CAPTURE(c); - REQUIRE(dist(c, c) == 0); + REQUIRE(dist1(c, c) == 0); + REQUIRE(dist2(c, c) == 0); } } TEST_CASE("distance between any valid char & '-' is 0", "[distance]") { for (auto c : valid_chars) { CAPTURE(c); - REQUIRE(dist(c, '-') == 0); - REQUIRE(dist('-', c) == 0); + REQUIRE(dist1(c, '-') == 0); + REQUIRE(dist1('-', c) == 0); + REQUIRE(dist2(c, '-') == 0); + REQUIRE(dist2('-', c) == 0); } } @@ -36,7 +44,8 @@ TEST_CASE("distance between two different valid characters is 1", if (c1 != c2) { CAPTURE(c1); CAPTURE(c2); - REQUIRE(dist(c1, c2) == 1); + REQUIRE(dist1(c1, c2) == 1); + REQUIRE(dist2(c1, c2) == 1); } } } @@ -48,7 +57,8 @@ TEST_CASE("distance between valid & invalid characters is 1", "[distance]") { CAPTURE(c1); CAPTURE(c2); CAPTURE((int)c2); - REQUIRE(dist(c1, c2) == 1); + REQUIRE(dist1(c1, c2) == 1); + REQUIRE(dist2(c1, c2) == 1); } } } @@ -58,7 +68,8 @@ TEST_CASE("distance between two invalid characters is 1", "[distance]") { for (auto c2 : invalid_chars) { CAPTURE(c1); CAPTURE(c2); - REQUIRE(dist(c1, c2) == 1); + REQUIRE(dist1(c1, c2) == 1); + REQUIRE(dist2(c1, c2) == 1); } } } @@ -66,8 +77,10 @@ TEST_CASE("distance between two invalid characters is 1", "[distance]") { TEST_CASE("distance between any invalid char & '-' is 1", "[distance]") { for (auto c : invalid_chars) { CAPTURE(c); - REQUIRE(dist(c, '-') == 1); - REQUIRE(dist('-', c) == 1); + REQUIRE(dist1(c, '-') == 1); + REQUIRE(dist1('-', c) == 1); + REQUIRE(dist2(c, '-') == 1); + REQUIRE(dist2('-', c) == 1); } } @@ -211,12 +224,13 @@ TEST_CASE("invalid input data: inconsistent sequence lengths", "[invalid]") { std::vector> expr; expr.push_back({{"ACGT"}, {"A"}}); expr.push_back({{"AC"}, {"ACGTCG"}}); - expr.push_back({{"ACGT"}, {"ACGT"}, {"ACGT"}, {"A"}, {"ACGT"}, {"ACGT"}}); + expr.push_back({{"ACGT"}, {"ACGT"}, {"ACGT"}, {"ACGT"}, {"ACGT"}, {"A"}}); expr.push_back({{"ACGT"}, {"A-----"}}); expr.push_back({{"ACGT"}, {""}}); std::string msg{"Error: Sequences do not all have the same length"}; for (auto &v : expr) { REQUIRE_THROWS_WITH(from_stringlist(v), msg); + REQUIRE_THROWS_WITH(distance(v.front(), v.back()), msg); } }