Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX: 1) bug in pattern algorithm for Hamming distances 2) speeding up… #2

Merged
merged 1 commit into from
Jan 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
#endif

// hamAdjacencyMatSparse
arma::sp_umat hamAdjacencyMatSparse(std::vector<std::string> strings, const int& maxdist, bool drop_deg_zero, std::string tempfile);
arma::sp_umat hamAdjacencyMatSparse(std::vector<std::string>& strings, const int& maxdist, bool drop_deg_zero, std::string tempfile);
RcppExport SEXP _NAIR_hamAdjacencyMatSparse(SEXP stringsSEXP, SEXP maxdistSEXP, SEXP drop_deg_zeroSEXP, SEXP tempfileSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::vector<std::string> >::type strings(stringsSEXP);
Rcpp::traits::input_parameter< std::vector<std::string>& >::type strings(stringsSEXP);
Rcpp::traits::input_parameter< const int& >::type maxdist(maxdistSEXP);
Rcpp::traits::input_parameter< bool >::type drop_deg_zero(drop_deg_zeroSEXP);
Rcpp::traits::input_parameter< std::string >::type tempfile(tempfileSEXP);
Expand All @@ -26,25 +26,25 @@ BEGIN_RCPP
END_RCPP
}
// hamDistBounded
int hamDistBounded(std::string a, std::string b, const int& k);
int hamDistBounded(const std::string& a, const std::string& b, const int& k);
RcppExport SEXP _NAIR_hamDistBounded(SEXP aSEXP, SEXP bSEXP, SEXP kSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type a(aSEXP);
Rcpp::traits::input_parameter< std::string >::type b(bSEXP);
Rcpp::traits::input_parameter< const std::string& >::type a(aSEXP);
Rcpp::traits::input_parameter< const std::string& >::type b(bSEXP);
Rcpp::traits::input_parameter< const int& >::type k(kSEXP);
rcpp_result_gen = Rcpp::wrap(hamDistBounded(a, b, k));
return rcpp_result_gen;
END_RCPP
}
// levAdjacencyMatSparse
arma::sp_umat levAdjacencyMatSparse(std::vector<std::string> strings, const int& maxdist, bool drop_deg_zero, std::string tempfile);
arma::sp_umat levAdjacencyMatSparse(std::vector<std::string>& strings, const int& maxdist, bool drop_deg_zero, std::string tempfile);
RcppExport SEXP _NAIR_levAdjacencyMatSparse(SEXP stringsSEXP, SEXP maxdistSEXP, SEXP drop_deg_zeroSEXP, SEXP tempfileSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::vector<std::string> >::type strings(stringsSEXP);
Rcpp::traits::input_parameter< std::vector<std::string>& >::type strings(stringsSEXP);
Rcpp::traits::input_parameter< const int& >::type maxdist(maxdistSEXP);
Rcpp::traits::input_parameter< bool >::type drop_deg_zero(drop_deg_zeroSEXP);
Rcpp::traits::input_parameter< std::string >::type tempfile(tempfileSEXP);
Expand Down
Binary file modified src/RcppExports.o
Binary file not shown.
28 changes: 17 additions & 11 deletions src/hamAdjacencyMatSparse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,29 +24,35 @@ using namespace arma;

// [[Rcpp::export(".hamAdjacencyMatSparse")]]
arma::sp_umat hamAdjacencyMatSparse(
std::vector<std::string> strings,
std::vector<std::string> &strings,
const int& maxdist,
bool drop_deg_zero,
std::string tempfile
) {

// allocate memory for data structures
std::unordered_map<std::string, std::vector<int>> str2idx; // keep map from unique strings to indices
const int dim = strings.size();
int dist;
arma::sp_umat out = speye<sp_umat>(dim, dim); // initialize as identity mat

// compute adjacencies for upper triangle
for (int j = 0; j < dim; ++j) { // columns
for (int i = 0; i < j; ++i) { // rows
dist = hamDistBounded(strings[i], strings[j], maxdist);
if (dist != -1) { out(i, j) = 1; }
Rcpp::checkUserInterrupt();
for (int i = 0; i < dim; i++)
str2idx[strings[i]].push_back(i);

// compute adjacencies
for (auto it1 = str2idx.begin(); it1 != str2idx.end(); ++it1) {
for (auto it2 = str2idx.begin(); it2 != it1; ++it2) {
dist = hamDistBounded(it1->first, it2->first, maxdist);
if (dist != -1)
for (auto idx1 : it1->second)
for (auto idx2 : it2->second)
out(idx1, idx2) = out(idx2, idx1) = 1;
}
for (auto idx1 : it1->second)
for (auto idx2 : it1->second)
out(idx1, idx2) = out(idx2, idx1) = 1;
Rcpp::checkUserInterrupt();
}

// reflect upper triangle to lower
out = arma::symmatu(out);

if (drop_deg_zero) {

// sum entries columnwise
Expand Down
Binary file modified src/hamAdjacencyMatSparse.o
Binary file not shown.
11 changes: 7 additions & 4 deletions src/hamDistBounded.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
using namespace Rcpp;

// [[Rcpp::export]]
int hamDistBounded(std::string a,
std::string b,
int hamDistBounded(const std::string &a,
const std::string &b,
const int& k) {

if (k < 0) { return(-1); } // trivial bound
Expand All @@ -41,8 +41,11 @@ int hamDistBounded(std::string a,
// Compute hamming distance; longer string truncated to length of shorter
int ind_bound = std::min(n, m);
for (int i = 0; i < ind_bound; ++i) {
if (a[i] != b[i]) { dist++; }
if (dist > k) { return(-1); } // stop if distance exceeds bound
if (a[i] != b[i]) {
dist++;
if (dist > k)
return(-1); // stop if distance exceeds bound
}
}

// return distance
Expand Down
4 changes: 2 additions & 2 deletions src/hamDistBounded.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#ifndef LEVADJ_SRC_HAMDISTBOUNDED_H_
#define LEVADJ_SRC_HAMDISTBOUNDED_H_

int hamDistBounded(std::string a,
std::string b,
int hamDistBounded(const std::string &a,
const std::string &b,
const int& k) ;

#endif /* LEVADJ_SRC_HAMDISTBOUNDED_H_ */
Binary file modified src/hamDistBounded.o
Binary file not shown.
31 changes: 18 additions & 13 deletions src/levAdjacencyMatSparse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,31 +22,37 @@ using namespace Rcpp;
using namespace arma;
// [[Rcpp::depends(RcppArmadillo)]]

// [[Rcpp::export(".levAdjacencyMatSparse")]]
// [[Rcpp::export(.levAdjacencyMatSparse)]]
arma::sp_umat levAdjacencyMatSparse(
std::vector<std::string> strings,
std::vector<std::string> &strings,
const int& maxdist,
bool drop_deg_zero,
std::string tempfile
) {

// allocate memory for data structures
std::unordered_map<std::string, std::vector<int>> str2idx; // keep map from unique strings to indices
const int dim = strings.size();
int dist;
arma::sp_umat out = speye<sp_umat>(dim, dim); // initialize as identity mat

// compute adjacencies for upper triangle
for (int j = 0; j < dim; ++j) { // columns
for (int i = 0; i < j; ++i) { // rows
dist = levDistBounded(strings[i], strings[j], maxdist);
if (dist != -1) { out(i, j) = 1; }
Rcpp::checkUserInterrupt();
for (int i = 0; i < dim; i++)
str2idx[strings[i]].push_back(i);

// compute adjacencies
for (auto it1 = str2idx.begin(); it1 != str2idx.end(); ++it1) {
for (auto it2 = str2idx.begin(); it2 != it1; ++it2) {
dist = levDistBounded(it1->first, it2->first, maxdist);
if (dist != -1)
for (auto idx1 : it1->second)
for (auto idx2 : it2->second)
out(idx1, idx2) = out(idx2, idx1) = 1;
}
for (auto idx1 : it1->second)
for (auto idx2 : it1->second)
out(idx1, idx2) = out(idx2, idx1) = 1;
Rcpp::checkUserInterrupt();
}

// reflect upper triangle to lower
out = arma::symmatu(out);

if (drop_deg_zero) {

// sum entries columnwise
Expand All @@ -66,7 +72,6 @@ arma::sp_umat levAdjacencyMatSparse(
col_ids.save(tempfile, raw_ascii);

}

return(out);

}
Expand Down
Binary file modified src/levAdjacencyMatSparse.o
Binary file not shown.
8 changes: 4 additions & 4 deletions src/levDistBounded.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@ int levDistBounded(std::string a,
b.erase(0, 1);
}
// Strip common suffix
while (!a.empty() && !b.empty() && a.back() == b.back()) {
a.pop_back();
b.pop_back();
}
int bound = std::min(n, m), start;
for (start = 0; start < bound && a[start] == b[start]; ++start);
a = a.substr(start);
b = b.substr(start);
// Use shorter string for column dimension to save memory
if (b.length() > a.length()) { a.swap(b); } // ensures b is not longer than a

Expand Down
Binary file modified src/levDistBounded.o
Binary file not shown.
4 changes: 4 additions & 0 deletions src/patternAdjacencyMatSparse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ inline std::unordered_set<std::string> getHamming2Patterns(
pattern.push_back('_');
pattern.push_back('_');
patterns->insert(pattern);
pattern = str;
pattern[static_cast<int>(str.length()) - 1] = '_';
pattern.push_back('_');
patterns->insert(pattern);
getHamming1Patterns(str, patterns);
return *patterns;
}
Expand Down
Binary file modified src/patternAdjacencyMatSparse.o
Binary file not shown.
Loading