Skip to content

Commit

Permalink
Merge pull request #46 from ssciwr/fix_44_distance_function
Browse files Browse the repository at this point in the history
add distance function
  • Loading branch information
lkeegan authored Dec 3, 2021
2 parents ae40e39 + 6d23cdd commit 5dde93e
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 11 deletions.
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ To use the Python interface, you should install it from PyPI:
python -m pip install hammingdist
```

## Distances matrix

Then, you can e.g. use it in the following way from Python:

```python
Expand Down Expand Up @@ -63,9 +65,17 @@ distances = hammingdist.fasta_reference_distances(sequence, fasta_file, include_

This function returns a numpy array that contains the distance of each sequence from the reference sequence.

You can also calculate the distance between two individual sequences:

```python
import hammingdist

distance = hammingdist.distance("ACGTX", "AAGTX", include_x=True)
```

## OpenMP on linux

The latest version of hammingdist on linux is now built with OpenMP (multithreading) support.
The latest versions of hammingdist on linux are now built with OpenMP (multithreading) support.
If this causes any issues, you can install a previous version of hammingdist without OpenMP support:

```bash
Expand Down
2 changes: 2 additions & 0 deletions include/hamming/hamming.hh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ DataSet from_fasta(const std::string &, bool include_x = false,
bool remove_duplicates = false, std::size_t n = 0);
DataSet from_lower_triangular(const std::string &);

ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1,
bool include_x = false);
std::vector<ReferenceDistIntType>
fasta_reference_distances(const std::string &reference_sequence,
const std::string &fasta_file,
Expand Down
3 changes: 3 additions & 0 deletions python/hammingdist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ PYBIND11_MODULE(hammingdist, m) {
m.def("from_lower_triangular", &from_lower_triangular,
"Creates a dataset by reading already computed distances from lower "
"triangular format");
m.def("distance", &distance, py::arg("seq0"), py::arg("seq1"),
py::arg("include_x") = false,
"Calculate the distance between seq0 and seq1");
m.def(
"fasta_reference_distances",
[](const std::string &reference_sequence, const std::string &fasta_file,
Expand Down
14 changes: 14 additions & 0 deletions python/tests/test_hammingdist.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,17 @@ def test_fasta_reference_distances(chars, include_x, tmp_path):
# if x is not included, invalid chars have distance 1 but data[i,i] returns 0 by construction
if include_x or i != j:
assert data[i, j] == dist
# should also agree with output of distance function for these two sequences
assert dist == hammingdist.distance(
sequences[i], sequences[j], include_x=include_x
)


def test_distance():
assert hammingdist.distance("ACGT", "ACCT") == 1
# here X is invalid so has distance 1 from itself:
assert hammingdist.distance("ACGTX", "ACCTX") == 2
# now X is valid so has distance 0 from itself:
assert hammingdist.distance("ACGTX", "ACCTX", include_x=True) == 1
with pytest.raises(RuntimeError):
hammingdist.distance("ACGT", "ACC")
22 changes: 22 additions & 0 deletions src/hamming.cc
Original file line number Diff line number Diff line change
Expand Up @@ -204,4 +204,26 @@ fasta_reference_distances(const std::string &reference_sequence,
return distances;
}

ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1,
bool include_x) {
auto lookup{lookupTable(include_x)};
ReferenceDistIntType distance{0};
if (seq0.size() != seq1.size()) {
throw std::runtime_error(
"Error: Sequences do not all have the same length");
}
for (std::size_t i = 0; i < seq0.size(); ++i) {
auto a{lookup[seq0[i]]};
auto b{lookup[seq1[i]]};
bool invalid{(a & b) == 0x00};
bool differ{(seq0[i] != seq1[i])};
if (invalid || differ) {
bool nodash{(a != 0xff) && (b != 0xff)};
distance +=
static_cast<ReferenceDistIntType>(invalid || (differ && nodash));
}
}
return distance;
}

} // namespace hamming
34 changes: 24 additions & 10 deletions src/hamming_t.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,38 @@
#include "tests.hh"
#include <cstdio>
#include <fstream>
#include <string>

using namespace hamming;

static constexpr std::array<char, 4> valid_chars{'A', 'C', 'G', 'T'};
static constexpr std::array<char, 6> invalid_chars{' ', 'N', '*',
'?', 'a', '.'};

static int dist(char c1, char c2) {
static int dist1(char c1, char c2) {
std::vector<std::string> v{std::string{c1}, std::string{c2}};
return from_stringlist(v)[{0, 1}];
}

static int dist2(char c1, char c2) {
return distance(std::string{c1}, std::string{c2});
}

TEST_CASE("distance between two equal valid characters is 0", "[distance]") {
for (auto c : valid_chars) {
CAPTURE(c);
REQUIRE(dist(c, c) == 0);
REQUIRE(dist1(c, c) == 0);
REQUIRE(dist2(c, c) == 0);
}
}

TEST_CASE("distance between any valid char & '-' is 0", "[distance]") {
for (auto c : valid_chars) {
CAPTURE(c);
REQUIRE(dist(c, '-') == 0);
REQUIRE(dist('-', c) == 0);
REQUIRE(dist1(c, '-') == 0);
REQUIRE(dist1('-', c) == 0);
REQUIRE(dist2(c, '-') == 0);
REQUIRE(dist2('-', c) == 0);
}
}

Expand All @@ -36,7 +44,8 @@ TEST_CASE("distance between two different valid characters is 1",
if (c1 != c2) {
CAPTURE(c1);
CAPTURE(c2);
REQUIRE(dist(c1, c2) == 1);
REQUIRE(dist1(c1, c2) == 1);
REQUIRE(dist2(c1, c2) == 1);
}
}
}
Expand All @@ -48,7 +57,8 @@ TEST_CASE("distance between valid & invalid characters is 1", "[distance]") {
CAPTURE(c1);
CAPTURE(c2);
CAPTURE((int)c2);
REQUIRE(dist(c1, c2) == 1);
REQUIRE(dist1(c1, c2) == 1);
REQUIRE(dist2(c1, c2) == 1);
}
}
}
Expand All @@ -58,16 +68,19 @@ TEST_CASE("distance between two invalid characters is 1", "[distance]") {
for (auto c2 : invalid_chars) {
CAPTURE(c1);
CAPTURE(c2);
REQUIRE(dist(c1, c2) == 1);
REQUIRE(dist1(c1, c2) == 1);
REQUIRE(dist2(c1, c2) == 1);
}
}
}

TEST_CASE("distance between any invalid char & '-' is 1", "[distance]") {
for (auto c : invalid_chars) {
CAPTURE(c);
REQUIRE(dist(c, '-') == 1);
REQUIRE(dist('-', c) == 1);
REQUIRE(dist1(c, '-') == 1);
REQUIRE(dist1('-', c) == 1);
REQUIRE(dist2(c, '-') == 1);
REQUIRE(dist2('-', c) == 1);
}
}

Expand Down Expand Up @@ -211,12 +224,13 @@ TEST_CASE("invalid input data: inconsistent sequence lengths", "[invalid]") {
std::vector<std::vector<std::string>> expr;
expr.push_back({{"ACGT"}, {"A"}});
expr.push_back({{"AC"}, {"ACGTCG"}});
expr.push_back({{"ACGT"}, {"ACGT"}, {"ACGT"}, {"A"}, {"ACGT"}, {"ACGT"}});
expr.push_back({{"ACGT"}, {"ACGT"}, {"ACGT"}, {"ACGT"}, {"ACGT"}, {"A"}});
expr.push_back({{"ACGT"}, {"A-----"}});
expr.push_back({{"ACGT"}, {""}});
std::string msg{"Error: Sequences do not all have the same length"};
for (auto &v : expr) {
REQUIRE_THROWS_WITH(from_stringlist(v), msg);
REQUIRE_THROWS_WITH(distance(v.front(), v.back()), msg);
}
}

Expand Down

0 comments on commit 5dde93e

Please sign in to comment.