diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a12975..181209d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.11) project( hammingdist - VERSION 0.16.0 + VERSION 0.17.0 LANGUAGES CXX) include(CTest) diff --git a/include/hamming/hamming.hh b/include/hamming/hamming.hh index 6a4d592..f83d4ff 100644 --- a/include/hamming/hamming.hh +++ b/include/hamming/hamming.hh @@ -19,6 +19,8 @@ std::vector fasta_reference_distances(const std::string &reference_sequence, const std::string &fasta_file, bool include_x = false); +std::vector fasta_sequence_indices(const std::string &fasta_file, + std::size_t n = 0); } // namespace hamming diff --git a/include/hamming/hamming_types.hh b/include/hamming/hamming_types.hh index d9b538b..cb68f7e 100644 --- a/include/hamming/hamming_types.hh +++ b/include/hamming/hamming_types.hh @@ -8,7 +8,7 @@ namespace hamming { -using DistIntType = uint16_t; +using DistIntType = uint8_t; using ReferenceDistIntType = uint32_t; struct DataSet { diff --git a/python/hammingdist.cc b/python/hammingdist.cc index 2345a26..c49276c 100644 --- a/python/hammingdist.cc +++ b/python/hammingdist.cc @@ -65,6 +65,16 @@ PYBIND11_MODULE(hammingdist, m) { py::arg("include_x") = false, "Calculates the distance of each sequence in the fasta file from the " "supplied reference sequence"); + m.def( + "fasta_sequence_indices", + [](const std::string &fasta_file, std::size_t n) { + return as_pyarray(fasta_sequence_indices(fasta_file, n)); + }, + py::arg("fasta_file"), py::arg("n") = 0, + "Returns the same output as dump_sequence_indices() but without " + "constructing the distances matrix." + "For each genome in the input fasta file it gives the index of the " + "corresponding row in the distances matrix which excludes duplicates"); } } // namespace hamming diff --git a/python/tests/test_hammingdist.py b/python/tests/test_hammingdist.py index f8784fb..932fc32 100644 --- a/python/tests/test_hammingdist.py +++ b/python/tests/test_hammingdist.py @@ -10,7 +10,7 @@ def write_fasta_file(filename, sequences): f.write(f">seq{i}\n{seq}\n") -def check_output_sizes(dat, n_in, n_out, tmp_out_file): +def check_output_sizes(dat, n_in, n_out, tmp_out_file, fasta_sequence_indices=None): dat.dump(tmp_out_file) dump = np.loadtxt(tmp_out_file, delimiter=",") assert len(dump) == n_out @@ -28,6 +28,8 @@ def check_output_sizes(dat, n_in, n_out, tmp_out_file): indices = np.loadtxt(tmp_out_file, delimiter=",") assert len(indices) == n_in assert indices[0] == 0 + if fasta_sequence_indices is not None: + assert np.allclose(indices, fasta_sequence_indices) def test_from_fasta(tmp_path): @@ -52,22 +54,27 @@ def test_from_fasta(tmp_path): data = hammingdist.from_fasta(fasta_file, include_x=True) check_output_sizes(data, 6, 6, output_file) + fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file) data = hammingdist.from_fasta(fasta_file, remove_duplicates=True) - check_output_sizes(data, 6, 4, output_file) + check_output_sizes(data, 6, 4, output_file, fasta_sequence_indices) + fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file) data = hammingdist.from_fasta(fasta_file, remove_duplicates=True, include_x=True) - check_output_sizes(data, 6, 4, output_file) + check_output_sizes(data, 6, 4, output_file, fasta_sequence_indices) + fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file, n=2) data = hammingdist.from_fasta(fasta_file, include_x=True, n=2) - check_output_sizes(data, 2, 2, output_file) + check_output_sizes(data, 2, 2, output_file, fasta_sequence_indices) + fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file, n=3) data = hammingdist.from_fasta(fasta_file, remove_duplicates=True, n=3) - check_output_sizes(data, 3, 2, output_file) + check_output_sizes(data, 3, 2, output_file, fasta_sequence_indices) + fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file, n=5) data = hammingdist.from_fasta( fasta_file, remove_duplicates=True, n=5, include_x=True ) - check_output_sizes(data, 5, 3, output_file) + check_output_sizes(data, 5, 3, output_file, fasta_sequence_indices) @pytest.mark.parametrize( diff --git a/setup.py b/setup.py index c1ad377..97d823b 100644 --- a/setup.py +++ b/setup.py @@ -93,7 +93,7 @@ def build_extension(self, ext): setup( name="hammingdist", - version="0.16.0", + version="0.17.0", author="Dominic Kempf, Liam Keegan", author_email="ssc@iwr.uni-heidelberg.de", description="A fast tool to calculate Hamming distances", @@ -114,6 +114,7 @@ def build_extension(self, ext): "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "License :: OSI Approved :: MIT License", diff --git a/src/hamming.cc b/src/hamming.cc index 0e94238..f1933ea 100644 --- a/src/hamming.cc +++ b/src/hamming.cc @@ -186,6 +186,34 @@ DataSet from_fasta(const std::string &filename, bool include_x, return DataSet(data, include_x, true, std::move(sequence_indices)); } +std::vector fasta_sequence_indices(const std::string &fasta_file, + std::size_t n) { + std::vector sequence_indices{}; + std::unordered_map map_seq_to_index; + std::ifstream stream(fasta_file); + if (n == 0) { + n = std::numeric_limits::max(); + } + std::size_t count = 0; + std::size_t count_unique = 0; + std::string line; + // skip first header + std::getline(stream, line); + while (count < n && !stream.eof()) { + std::string seq{}; + while (std::getline(stream, line) && line[0] != '>') { + seq.append(line); + } + auto result = map_seq_to_index.emplace(std::move(seq), count_unique); + if (result.second) { + ++count_unique; + } + sequence_indices.push_back(result.first->second); + ++count; + } + return sequence_indices; +} + std::vector fasta_reference_distances(const std::string &reference_sequence, const std::string &fasta_file, bool include_x) {