Skip to content

Commit

Permalink
Merge pull request #18 from ssciwr/fix_11_lower_triangular
Browse files Browse the repository at this point in the history
add lower triangular matrix input/output
  • Loading branch information
dokempf authored Jun 23, 2021
2 parents 1f7ae99 + 5f90451 commit a70ed24
Show file tree
Hide file tree
Showing 8 changed files with 82 additions and 3 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,14 @@ data = hammingdist.from_fasta("example.fasta", 100)
# The distance data can be accessed point-wise, though looping over all distances might be quite inefficient
print(data[14,42])
# The data can be written to disk and retrieved:
# The data can be written to disk in csv format (default `distance` Ripser format) and retrieved:
data.dump("backup.csv")
retrieval = hammingdist.from_csv("backup.csv")
# It can also be written in lower triangular format (comma-delimited row-major, `lower-distance` Ripser format):
data.dump_lower_triangular("lt.txt")
retrieval = hammingdist.from_lower_triangular("lt.txt")
# Finally, we can pass the data as a list of strings in Python:
data = hammingdist.from_stringlist(["ACGTACGT", "ACGTAGGT", "ATTTACGT"])
Expand Down
1 change: 1 addition & 0 deletions include/hamming/hamming.hh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ namespace hamming {
DataSet from_stringlist(std::vector<std::string>&);
DataSet from_csv(const std::string&);
DataSet from_fasta(const std::string&, std::size_t n = 0);
DataSet from_lower_triangular(const std::string&);

}

Expand Down
2 changes: 2 additions & 0 deletions include/hamming/hamming_types.hh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ struct DataSet
{
DataSet(std::vector<std::string>&, bool clear_input_data = false);
DataSet(const std::string&);
DataSet(std::vector<DistIntType>&& distances);
void dump(const std::string&);
void dump_lower_triangular(const std::string&);
int operator[](const std::array<std::size_t, 2>&) const;

std::size_t nsamples;
Expand Down
4 changes: 3 additions & 1 deletion python/hammingdist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@ PYBIND11_MODULE(hammingdist,m)
m.doc() = "Small tool to calculate Hamming distances between gene sequences";

py::class_<DataSet>(m, "DataSet")
.def("dump", &DataSet::dump)
.def("dump", &DataSet::dump, "Dump distances matrix in csv format")
.def("dump_lower_triangular", &DataSet::dump_lower_triangular, "Dump distances matrix in lower triangular format (comma-delimited, row-major)")
.def("__getitem__", &DataSet::operator[])
.def_readonly("_distances", &DataSet::result);

m.def("from_stringlist", &from_stringlist, "Creates a dataset from a list of strings");
m.def("from_csv", &from_csv, "Creates a dataset by reading already computed distances from csv (full matrix expected)");
m.def("from_fasta", &from_fasta, py::arg("filename"), py::arg("n") = 0, "Creates a dataset by reading from a fasta file (assuming all sequences have equal length)");
m.def("from_lower_triangular", &from_lower_triangular, "Creates a dataset by reading already computed distances from lower triangular format");
}

}
43 changes: 43 additions & 0 deletions src/hamming.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include<array>
#include<algorithm>
#include<cmath>
#include<fstream>
#include<iostream>
#include<limits>
Expand Down Expand Up @@ -45,6 +46,16 @@ DataSet::DataSet(const std::string& filename)
}
}

static std::size_t uint_sqrt(std::size_t x){
return static_cast<std::size_t>(std::round(std::sqrt(x)));
}

DataSet::DataSet(std::vector<DistIntType>&& distances) : result{std::move(distances)}
{
// infer n from number of lower triangular matrix elements = n(n-1)/2
nsamples = (uint_sqrt(8*result.size()+1)+1)/2;
}

void DataSet::dump(const std::string& filename)
{
std::ofstream stream(filename);
Expand All @@ -60,6 +71,20 @@ void DataSet::dump(const std::string& filename)
}
}

void DataSet::dump_lower_triangular(const std::string& filename)
{
std::ofstream stream(filename);
std::size_t k = 0;
for(std::size_t i=1; i<nsamples; ++i)
{
for(std::size_t j=0; j+1<i; ++j)
{
stream << static_cast<int>(result[k++]) << ",";
}
stream << static_cast<int>(result[k++]) << "\n";
}
}

int DataSet::operator[](const std::array<std::size_t, 2>& index) const
{
auto i = index[0];
Expand All @@ -81,6 +106,24 @@ DataSet from_csv(const std::string& filename)
return DataSet(filename);
}

DataSet from_lower_triangular(const std::string& filename)
{
std::vector<DistIntType> distances;
std::ifstream stream(filename);
std::string line;
while(std::getline(stream, line))
{
std::istringstream s(line);
std::string d;
while(s.good())
{
std::getline(s, d, ',');
distances.push_back(safe_int_cast(std::stoi(d)));
}
}
return DataSet(std::move(distances));
}

DataSet from_fasta(const std::string& filename, std::size_t n)
{
std::vector<std::string> data;
Expand Down
2 changes: 1 addition & 1 deletion src/hamming_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ std::array<GeneBlock, 256> lookupTable()
return lookup;
}

static DistIntType safe_int_cast(int x){
DistIntType safe_int_cast(int x){
if(x > std::numeric_limits<DistIntType>::max()){
throw std::runtime_error("Error: Distance is too large for chosen integer type");
}
Expand Down
2 changes: 2 additions & 0 deletions src/hamming_impl.hh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ namespace hamming {

std::array<GeneBlock, 256> lookupTable();

DistIntType safe_int_cast(int x);

std::vector<DistIntType> distances(std::vector<std::string>& data, bool clear_input_data);

int distance_sparse(const SparseData& a, const SparseData& b);
Expand Down
25 changes: 25 additions & 0 deletions src/hamming_t.cc
Original file line number Diff line number Diff line change
Expand Up @@ -193,3 +193,28 @@ TEST_CASE("throws on distance integer overflow", "[hamming]") {
std::string msg{"Error: Distance is too large for chosen integer type"};
REQUIRE_THROWS_WITH(DataSet(data), msg);
}

TEST_CASE("from_lower_triangular reproduces correct data", "[hamming][Q]") {
std::mt19937 gen(12345);
char tmp_file_name[L_tmpnam];
REQUIRE(std::tmpnam(tmp_file_name) != nullptr);
for (auto n : {1, 2, 3, 5, 9, 10, 19, 100, 207}) {
CAPTURE(n);
std::vector<std::string> data(n);
for(auto& d : data)
d = make_test_string(24, gen);

DataSet ref(data);
REQUIRE(ref.nsamples == n);
ref.dump_lower_triangular(std::string(tmp_file_name));

auto restore = from_lower_triangular(std::string(tmp_file_name));
REQUIRE(ref.nsamples == restore.nsamples);
for(std::size_t i=0; i<ref.nsamples; ++i) {
for(std::size_t j=0; j<ref.nsamples; ++j) {
REQUIRE(ref[{i, j}] == restore[{i, j}]);
}
}
}
std::remove(tmp_file_name);
}

0 comments on commit a70ed24

Please sign in to comment.