Skip to content

Commit

Permalink
Merge pull request #74 from ssciwr/fix_72_sequence_indices
Browse files Browse the repository at this point in the history
add fasta_sequence_indices
  • Loading branch information
lkeegan authored Sep 5, 2022
2 parents f354bc5 + 0301f79 commit 0ea8ac9
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 9 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.11)

project(
hammingdist
VERSION 0.16.0
VERSION 0.17.0
LANGUAGES CXX)

include(CTest)
Expand Down
2 changes: 2 additions & 0 deletions include/hamming/hamming.hh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ std::vector<ReferenceDistIntType>
fasta_reference_distances(const std::string &reference_sequence,
const std::string &fasta_file,
bool include_x = false);
std::vector<std::size_t> fasta_sequence_indices(const std::string &fasta_file,
std::size_t n = 0);

} // namespace hamming

Expand Down
2 changes: 1 addition & 1 deletion include/hamming/hamming_types.hh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

namespace hamming {

using DistIntType = uint16_t;
using DistIntType = uint8_t;
using ReferenceDistIntType = uint32_t;

struct DataSet {
Expand Down
10 changes: 10 additions & 0 deletions python/hammingdist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,16 @@ PYBIND11_MODULE(hammingdist, m) {
py::arg("include_x") = false,
"Calculates the distance of each sequence in the fasta file from the "
"supplied reference sequence");
m.def(
"fasta_sequence_indices",
[](const std::string &fasta_file, std::size_t n) {
return as_pyarray(fasta_sequence_indices(fasta_file, n));
},
py::arg("fasta_file"), py::arg("n") = 0,
"Returns the same output as dump_sequence_indices() but without "
"constructing the distances matrix."
"For each genome in the input fasta file it gives the index of the "
"corresponding row in the distances matrix which excludes duplicates");
}

} // namespace hamming
19 changes: 13 additions & 6 deletions python/tests/test_hammingdist.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def write_fasta_file(filename, sequences):
f.write(f">seq{i}\n{seq}\n")


def check_output_sizes(dat, n_in, n_out, tmp_out_file):
def check_output_sizes(dat, n_in, n_out, tmp_out_file, fasta_sequence_indices=None):
dat.dump(tmp_out_file)
dump = np.loadtxt(tmp_out_file, delimiter=",")
assert len(dump) == n_out
Expand All @@ -28,6 +28,8 @@ def check_output_sizes(dat, n_in, n_out, tmp_out_file):
indices = np.loadtxt(tmp_out_file, delimiter=",")
assert len(indices) == n_in
assert indices[0] == 0
if fasta_sequence_indices is not None:
assert np.allclose(indices, fasta_sequence_indices)


def test_from_fasta(tmp_path):
Expand All @@ -52,22 +54,27 @@ def test_from_fasta(tmp_path):
data = hammingdist.from_fasta(fasta_file, include_x=True)
check_output_sizes(data, 6, 6, output_file)

fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file)
data = hammingdist.from_fasta(fasta_file, remove_duplicates=True)
check_output_sizes(data, 6, 4, output_file)
check_output_sizes(data, 6, 4, output_file, fasta_sequence_indices)

fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file)
data = hammingdist.from_fasta(fasta_file, remove_duplicates=True, include_x=True)
check_output_sizes(data, 6, 4, output_file)
check_output_sizes(data, 6, 4, output_file, fasta_sequence_indices)

fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file, n=2)
data = hammingdist.from_fasta(fasta_file, include_x=True, n=2)
check_output_sizes(data, 2, 2, output_file)
check_output_sizes(data, 2, 2, output_file, fasta_sequence_indices)

fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file, n=3)
data = hammingdist.from_fasta(fasta_file, remove_duplicates=True, n=3)
check_output_sizes(data, 3, 2, output_file)
check_output_sizes(data, 3, 2, output_file, fasta_sequence_indices)

fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file, n=5)
data = hammingdist.from_fasta(
fasta_file, remove_duplicates=True, n=5, include_x=True
)
check_output_sizes(data, 5, 3, output_file)
check_output_sizes(data, 5, 3, output_file, fasta_sequence_indices)


@pytest.mark.parametrize(
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def build_extension(self, ext):

setup(
name="hammingdist",
version="0.16.0",
version="0.17.0",
author="Dominic Kempf, Liam Keegan",
author_email="[email protected]",
description="A fast tool to calculate Hamming distances",
Expand All @@ -114,6 +114,7 @@ def build_extension(self, ext):
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"License :: OSI Approved :: MIT License",
Expand Down
28 changes: 28 additions & 0 deletions src/hamming.cc
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,34 @@ DataSet from_fasta(const std::string &filename, bool include_x,
return DataSet(data, include_x, true, std::move(sequence_indices));
}

std::vector<std::size_t> fasta_sequence_indices(const std::string &fasta_file,
std::size_t n) {
std::vector<std::size_t> sequence_indices{};
std::unordered_map<std::string, std::size_t> map_seq_to_index;
std::ifstream stream(fasta_file);
if (n == 0) {
n = std::numeric_limits<std::size_t>::max();
}
std::size_t count = 0;
std::size_t count_unique = 0;
std::string line;
// skip first header
std::getline(stream, line);
while (count < n && !stream.eof()) {
std::string seq{};
while (std::getline(stream, line) && line[0] != '>') {
seq.append(line);
}
auto result = map_seq_to_index.emplace(std::move(seq), count_unique);
if (result.second) {
++count_unique;
}
sequence_indices.push_back(result.first->second);
++count;
}
return sequence_indices;
}

std::vector<ReferenceDistIntType>
fasta_reference_distances(const std::string &reference_sequence,
const std::string &fasta_file, bool include_x) {
Expand Down

0 comments on commit 0ea8ac9

Please sign in to comment.