diff --git a/.gitignore b/.gitignore index b66fee6..73978b9 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ *.RData *.Ruserdata *.bin -*.out \ No newline at end of file +*.out +.Rproj.user \ No newline at end of file diff --git a/README.md b/README.md index 0fbe69a..ec18b65 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ An optimised implementation of the tau statistic (relative prevalence ratio form * [References and credits](#References-and-credits) ## The statistic -I was evaluating the ['elevated prevalence' form](https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0155249.s003&type=supplementary#page=6 "Lessler et al. Appendix 5, p6") of the tau statistic [[Lessler et al]](#References-and-credits) as we had data on the underlying population (i.e. non-cases as well as cases) containing months of disease onset *ti* and UTM coordinates of their household (*xi*,*yi*). I optimised the implementation of the tau statistic from the development repo of the `IDSpatialStats::get.tau()` function, leading to a ~**52x speedup**. +I was evaluating the ['elevated prevalence' form](https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0155249.s003&type=supplementary#page=6 "Lessler et al. Appendix 5, p6") of the tau statistic [[Lessler et al]](#References-and-credits) as we had data on the underlying population (i.e. non-cases as well as cases) containing months of disease onset *ti* and UTM coordinates of their household (*xi*,*yi*). I optimised the implementation of the tau statistic from the development repo of the `IDSpatialStats::get.tau()` function, leading to a ~**76x speedup**. @@ -23,11 +23,11 @@ which is the proportion of related case pairs within a specified distance versus The relatedness of a case pair *zij*, is determined here using temporal information, if then *z_ij*=1 else 0. ## How the speedup was done -Rather than running `IDSpatialStats::get.tau()` function in an R script as described by `?get.tau()`, I isolated the C function responsible (`get_tau` on [line 403](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L403) from `spatialfuncs.c` source file for the [2782d6d](https://github.com/HopkinsIDD/IDSpatialStats/commit/2782d6dcc9ee4be9855b5e468ce789425b81d49a "Commit 2782d6d on 17 Dec 2018") commit), coded the four features summarised below, then ran it in an R script by sourcing it with `Rcpp::sourceCpp()` and then calling `getTau()` without needing `library(IDSpatialStats)` on [lines 6-10](https://github.com/t-pollington/tau-statistic-speedup/blob/master/run_get_tau.R#L6). I have provided both the R script file `run_get_tau.R` and the C file `get_tau.cpp` containing useful comments. +Rather than running `IDSpatialStats::get.tau()` function in an R script as described by `?get.tau()`, I isolated the C function responsible (`get_tau` on [line 403](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L403) from `spatialfuncs.c` source file for the [2782d6d](https://github.com/HopkinsIDD/IDSpatialStats/commit/2782d6dcc9ee4be9855b5e468ce789425b81d49a "Commit 2782d6d on 17 Dec 2018") commit), coded the four features summarised below, then ran it in an R script by sourcing it with `Rcpp::sourceCpp()` and then calling `getTau()` without needing `library(IDSpatialStats)` on [lines 6-10](https://github.com/t-pollington/tau-statistic-speedup/blob/master/run_get_tau.R#L6). I have provided commented R script file `run_get_tau.R` (in the root) and the C file `get_tau.cpp` (in the /src of the package) containing useful comments. 1. **Stop calls to R within C** (~26x speedup) -*Currently*: Previously the R function `get.tau()` would call the `get_tau()` C function on lines [403-449](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L403 +*Previously*: Previously the R function `get.tau()` would call the `get_tau()` C function on lines [403-449](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L403 ) (and internally `get_pi()` on [line 427](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L427 )). My `getTau()` function skips that step for easier reading here, not contributing to the speedup; so in essence the heart of the code isolated was described by `get_pi()` on [lines 64-148](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L64 ). `get_pi()` has 3 nested loops: over distance windows, then a double loop for paired links between people. The slowdown occurs at [lines 126-129](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L126 @@ -37,19 +37,25 @@ Rather than running `IDSpatialStats::get.tau()` function in an R script as descr 2. **Stop repeat evaluations of undirected edges** (~2x speedup) -*Currently*: In the sum over all people the same link will be visited twice ie *i*->*j* and *j*->*i* but this isn't necessary as is symmetric to *i*,*j* switching due to the modulus function. In most temporally-related or serotype-shared scenarios the pairs are undirected and so the summation should really be 'upper triangular' style i.e. . +*Previously*: In the sum over all people the same link will be visited twice ie *i*->*j* and *j*->*i* but this isn't necessary as is symmetric to *i*,*j* switching due to the modulus function. In most temporally-related or serotype-shared scenarios the pairs are undirected and so the summation should really be 'upper triangular' style i.e. . *Change*: Understandably by halving the loop we get a ~2x speedup. -3. **Split the `posmat` data matrix into multiple vectors** (~20% speedup) +3. **Explicitly force the compiler to use AVX2 CPU vectorisation** (~47% speedup) -*Currently*: `posmat` is the matrix argument that gets fed into `get.tau()`. As the loops sequentially go through `i`, `j` & `k`, if posmat is large columnwise then the next row's value in memory may not be that close to the current location and this extra memory access time will add delays. +*Previously*: The standard compiler flags are '-O2' and it is unclear if the loops were fully vectorised given the `if(...) continue;` statements within them. + +*Change*: Since the first version I have built the .cpp file into a minimal package using `Rcpp::Rcpp.package.skeleton()` with empty description and help files. The purpose is that the _configure_ file can set the 'g++' compiler flags to utilise '-O3' optimisation and '-mavx2' vectorisation which is common in CPU architectures; to tweak to your architecture please read [How to change and set Rcpp compile arguments](https://stackoverflow.com/questions/32586455/how-to-change-and-set-rcpp-compile-arguments). All `if()` statements in the inner loops have been replaced with boolean arithmetic. + +4. **Split the `posmat` data matrix into multiple vectors** (~20% speedup) + +*Previously*: `posmat` is the matrix argument that gets fed into `get.tau()`. As the loops sequentially go through `i`, `j` & `k`, if posmat is large columnwise then the next row's value in memory may not be that close to the current location and this extra memory access time will add delays. *Change*: Using vectors for each variable guarantees that the next observation for a variable will be next in memory. -4. **Work with squared distances to avoid `sqrt()`** (negligible speedup) +5. **Work with squared distances to avoid `sqrt()`** (negligible speedup) -*Currently*: To calculate the distance separation *d_ij* a Euclidean distance was calculated. +*Previously*: To calculate the distance separation *d_ij* a Euclidean distance was calculated. *Change*: Working with squared distance ranges can make `sqrt()` redundant. @@ -59,13 +65,13 @@ Although I have found enormous speedup improvements (constructing 500 bootstrapp ## Replication Unfortunately I can't share the dataset but can describe what you need: -* R v3.5.1 +* R v3.6.0 * library `Rcpp` for `sourceCpp()`. `IDSpatialStats` isn't required. * data = R `matrix`-type object with columns named: "ORIG_ID"; "x"; "y"; "onset" and no missing data. For non-cases, the "onset" column should be numerically coded as -999. ## Features not implemented -* parallel computations across the `for(i){}` loop for *i* in `get_tau.cpp`. I tried using parallel packages in R and C's `#pragma omp parallel for` with `#include ` but to no avail. -* GPU computations. A good starting place for rapid code development would be MATLAB's `gpuArray` class. +* parallel computations across the `for(i){}` loop for *i* in `get_tau.cpp`. I tried to no avail using 'RcppArmadillo' to enable `#pragma omp parallel for` within `get_tau.cpp` as there are [concerns that `Rcpp` alone can't deal with OpenMP](https://stackoverflow.com/questions/48069990/multithreaded-simd-vectorized-mandelbrot-in-r-using-rcpp-openmp). Applying `#pragma omp simd` to the loops after the compiler optimisations above gave no speed improvement; there were difficulties applying it since although the for loop over *i* goes from 1 to N the nested *j* loop is of different length which depends on *i*. +* GPU computations. A good starting place for rapid code development would be MATLAB's `gpuArray` class or ArrayFire. My only reservation is the code would need adapting so the distance matrix is calculated on-the-fly only once rather than over the k-loop for the size of the radius intervals required. This would help reduce the amount of GPU device memory accesses from the GPU cores. * separating distance calculations from the Rfun code didn't lead to a speedup. Even though distances are needless calculated multiple times, the slowdown probably comes from storing these distances in slower-to-access caches, when a just-in-time method works faster. * **Have you found an even faster way to do this? I'm open in principle to pull requests to this repo but message me to check.** * **Found a bug or even a typo? I'd love to know! We're not perfect and we should all be open to criticism so we can do the best science for infectious disease modelling.** @@ -73,4 +79,4 @@ Unfortunately I can't share the dataset but can describe what you need: ## References and credits 1. Lessler J, Salje H, Grabowski MK, Cummings DAT. *Measuring Spatial Dependence for Infectious Disease Epidemiology*. PLoS One 2016; 11: 5–1–13. doi: [10.1371/journal.pone.0155249](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0155249). 2. `HopkinsIDD/IDSpatialStats` (development repo for R's IDSpatialStats package) maintained by @jlessler and @gilesjohnr. Code from the [CRAN read-only mirror](https://github.com/cran/IDSpatialStats) hasn't been used as it is several months behind the development repo. -* Thanks to [CodeCogs](https://www.codecogs.com/latex/eqneditor.php "CodeCogs LaTeX equation editor, just copy+paste the HTML they provide") for renderring the mathematical formulae, let's hope they don't close down their site—Github **still** doesn't consider renderring LaTeX in README.md is a core functionality of code development! +* Thanks to [CodeCogs](https://www.codecogs.com/latex/eqneditor.php "CodeCogs LaTeX equation editor, just copy+paste the HTML they provide") for renderring the mathematical formulae, let's hope they don't close down their site—Github **still** doesn't consider renderring LaTeX in README.md is a core functionality of code development! \ No newline at end of file diff --git a/anRpackage/DESCRIPTION b/anRpackage/DESCRIPTION deleted file mode 100644 index 88c7e8b..0000000 --- a/anRpackage/DESCRIPTION +++ /dev/null @@ -1,12 +0,0 @@ -Package: anRpackage -Type: Package -Title: What the Package Does in One 'Title Case' Line -Version: 1.0 -Date: 2019-08-06 -Author: Your Name -Maintainer: Your Name -Description: One paragraph description of what the package does as one or more full - sentences. -License: GPL (>= 2) -Imports: Rcpp (>= 1.0.1) -LinkingTo: Rcpp diff --git a/anRpackage/NAMESPACE b/anRpackage/NAMESPACE deleted file mode 100644 index 0ca1d67..0000000 --- a/anRpackage/NAMESPACE +++ /dev/null @@ -1,3 +0,0 @@ -useDynLib(anRpackage, .registration=TRUE) -exportPattern("^[[:alpha:]]+") -importFrom(Rcpp, evalCpp) diff --git a/anRpackage/R/RcppExports.R b/anRpackage/R/RcppExports.R deleted file mode 100644 index 1bbcd19..0000000 --- a/anRpackage/R/RcppExports.R +++ /dev/null @@ -1,11 +0,0 @@ -# Generated by using Rcpp::compileAttributes() -> do not edit by hand -# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 - -getTau <- function(ORIG_ID, x, y, onset, r, r_low, index) { - .Call(`_anRpackage_getTau`, ORIG_ID, x, y, onset, r, r_low, index) -} - -rcpp_hello_world <- function() { - .Call(`_anRpackage_rcpp_hello_world`) -} - diff --git a/anRpackage/Read-and-delete-me b/anRpackage/Read-and-delete-me deleted file mode 100644 index 6d6236d..0000000 --- a/anRpackage/Read-and-delete-me +++ /dev/null @@ -1,9 +0,0 @@ -* Edit the help file skeletons in 'man', possibly combining help files for multiple - functions. -* Edit the exports in 'NAMESPACE', and add necessary imports. -* Put any C/C++/Fortran code in 'src'. -* If you have compiled code, add a useDynLib() directive to 'NAMESPACE'. -* Run R CMD build to build the package tarball. -* Run R CMD check to check the package tarball. - -Read "Writing R Extensions" for more information. diff --git a/anRpackage/cleanup b/anRpackage/cleanup deleted file mode 100755 index 9a8dcf9..0000000 --- a/anRpackage/cleanup +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -if [ -e "~/.R/Makevars.bak_CustomConfig" ]; then - mv -f ~/.R/Makevars.bak_CustomConfig ~/.R/Makevars -fi \ No newline at end of file diff --git a/anRpackage/configure b/anRpackage/configure deleted file mode 100755 index 1839ac8..0000000 --- a/anRpackage/configure +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -if [ ! -d "~/.R" ]; then - mkdir ~/.R; touch ~/.R/Makevars - echo "CXXFLAGS= -O3 -std=c++11 -Wall -mavx2" > ~/.R/Makevars -elif [ ! -e "~/.R/Makevars" ]; then - touch ~/.R/Makevars - echo "CXXFLAGS= -O3 -std=c++11 -Wall -mavx2" > ~/.R/Makevars -else - mv ~/.R/Makevars ~/.R/Makevars.bak_CustomConfig - echo "CXXFLAGS= -O3 -std=c++11 -Wall -mavx2" > ~/.R/Makevars -fi \ No newline at end of file diff --git a/anRpackage/man/anRpackage-package.Rd b/anRpackage/man/anRpackage-package.Rd deleted file mode 100644 index a23ac20..0000000 --- a/anRpackage/man/anRpackage-package.Rd +++ /dev/null @@ -1,34 +0,0 @@ -\name{anRpackage-package} -\alias{anRpackage-package} -\alias{anRpackage} -\docType{package} -\title{ - A short title line describing what the package does -} -\description{ - A more detailed description of what the package does. A length - of about one to five lines is recommended. -} -\details{ - This section should provide a more detailed overview of how to use the - package, including the most important functions. -} -\author{ -Your Name, email optional. - -Maintainer: Your Name -} -\references{ - This optional section can contain literature or other references for - background information. -} -\keyword{ package } -\seealso{ - Optional links to other man pages -} -\examples{ - \dontrun{ - ## Optional simple examples of the most important functions - ## These can be in \dontrun{} and \donttest{} blocks. - } -} diff --git a/anRpackage/man/rcpp_hello_world.Rd b/anRpackage/man/rcpp_hello_world.Rd deleted file mode 100644 index e4f90bf..0000000 --- a/anRpackage/man/rcpp_hello_world.Rd +++ /dev/null @@ -1,17 +0,0 @@ -\name{rcpp_hello_world} -\alias{rcpp_hello_world} -\docType{package} -\title{ -Simple function using Rcpp -} -\description{ -Simple function using Rcpp -} -\usage{ -rcpp_hello_world() -} -\examples{ -\dontrun{ -rcpp_hello_world() -} -} diff --git a/anRpackage/src/RcppExports.cpp b/anRpackage/src/RcppExports.cpp deleted file mode 100644 index c909457..0000000 --- a/anRpackage/src/RcppExports.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Generated by using Rcpp::compileAttributes() -> do not edit by hand -// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 - -#include - -using namespace Rcpp; - -// getTau -NumericVector getTau(const NumericVector ORIG_ID, const NumericVector x, const NumericVector y, const NumericVector onset, const NumericVector r, const NumericVector r_low, SEXP index); -RcppExport SEXP _anRpackage_getTau(SEXP ORIG_IDSEXP, SEXP xSEXP, SEXP ySEXP, SEXP onsetSEXP, SEXP rSEXP, SEXP r_lowSEXP, SEXP indexSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const NumericVector >::type ORIG_ID(ORIG_IDSEXP); - Rcpp::traits::input_parameter< const NumericVector >::type x(xSEXP); - Rcpp::traits::input_parameter< const NumericVector >::type y(ySEXP); - Rcpp::traits::input_parameter< const NumericVector >::type onset(onsetSEXP); - Rcpp::traits::input_parameter< const NumericVector >::type r(rSEXP); - Rcpp::traits::input_parameter< const NumericVector >::type r_low(r_lowSEXP); - Rcpp::traits::input_parameter< SEXP >::type index(indexSEXP); - rcpp_result_gen = Rcpp::wrap(getTau(ORIG_ID, x, y, onset, r, r_low, index)); - return rcpp_result_gen; -END_RCPP -} -// rcpp_hello_world -List rcpp_hello_world(); -RcppExport SEXP _anRpackage_rcpp_hello_world() { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - rcpp_result_gen = Rcpp::wrap(rcpp_hello_world()); - return rcpp_result_gen; -END_RCPP -} - -static const R_CallMethodDef CallEntries[] = { - {"_anRpackage_getTau", (DL_FUNC) &_anRpackage_getTau, 7}, - {"_anRpackage_rcpp_hello_world", (DL_FUNC) &_anRpackage_rcpp_hello_world, 0}, - {NULL, NULL, 0} -}; - -RcppExport void R_init_anRpackage(DllInfo *dll) { - R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); - R_useDynamicSymbols(dll, FALSE); -} diff --git a/anRpackage/src/RcppExports.o b/anRpackage/src/RcppExports.o deleted file mode 100644 index e349a89..0000000 Binary files a/anRpackage/src/RcppExports.o and /dev/null differ diff --git a/anRpackage/src/anRpackage.so b/anRpackage/src/anRpackage.so deleted file mode 100755 index 298ce61..0000000 Binary files a/anRpackage/src/anRpackage.so and /dev/null differ diff --git a/anRpackage/src/get_tau.cpp b/anRpackage/src/get_tau.cpp deleted file mode 100644 index 883e597..0000000 --- a/anRpackage/src/get_tau.cpp +++ /dev/null @@ -1,77 +0,0 @@ -//Credit: Inspired and amended from https://github.com/HopkinsIDD/IDSpatialStats/commit/2782d6dcc9ee4be9855b5e468ce789425b81d49a by @gilesjohnr @jlessler -//Author: @t-pollington -//Date: 6 Aug 2019 -#include -#include -#include -#define NTYPE unsigned short //fine if N<=65535 -using namespace std; -using namespace Rcpp; -// [[Rcpp::export]] - -NumericVector getTau(const NumericVector ORIG_ID, const NumericVector x, const NumericVector y, const NumericVector onset, const NumericVector r, const NumericVector r_low, SEXP index){ //see how multiple vector arguments are parsed in here rather than the single posmat matrix - IntegerVector ORIG_IDint = as(ORIG_ID); - IntegerVector onsetint = as(onset); - NTYPE i,j,k; - float dist2 = 0; - float r2 = 0; - float r2_low = 0; - unsigned long num_cnt, denom_cnt; //counters for those filling conditions// - unsigned short serialintvl = 7; //mean serial interval of the disease - unsigned short r_size = r.size(); - NTYPE N = ORIG_ID.size(); - int *inds = INTEGER(index); - NumericVector tau(r_size, NULL); - float piInf = 0; - bool check = 1; - bool bstrapconflict = 0; - bool sameperson = 0; - bool iscasepair = 0; - bool temprelated = 0; - bool withindist = 0; - - if(*inds==-1){ //if index was set to -1 then it means we can turn off bootstrapping checks - check = 0; - } - - num_cnt = 0; - denom_cnt = 0; - - //calc piInf - for (i=0;i= r2_low) && (dist2 < r2)); - sameperson = (ORIG_IDint[i]==ORIG_IDint[j]); - denom_cnt = denom_cnt + (!(bstrapconflict)*!(sameperson)*withindist); - iscasepair = (onsetint[i]!=-999) && (onsetint[j]!=-999); - temprelated = (abs(onsetint[i]-onsetint[j]) <= serialintvl); - num_cnt = num_cnt + (!(bstrapconflict)*!(sameperson)*iscasepair*temprelated*withindist); - } - } - tau[k] = (float)num_cnt/denom_cnt; // pi(r.min,r.max) - tau[k] = (float)tau[k]/piInf; - } - return(tau); -} \ No newline at end of file diff --git a/anRpackage/src/get_tau.o b/anRpackage/src/get_tau.o deleted file mode 100644 index 771e71e..0000000 Binary files a/anRpackage/src/get_tau.o and /dev/null differ diff --git a/anRpackage/src/rcpp_hello_world.cpp b/anRpackage/src/rcpp_hello_world.cpp deleted file mode 100644 index 98a959c..0000000 --- a/anRpackage/src/rcpp_hello_world.cpp +++ /dev/null @@ -1,13 +0,0 @@ - -#include -using namespace Rcpp; - -// [[Rcpp::export]] -List rcpp_hello_world() { - - CharacterVector x = CharacterVector::create( "foo", "bar" ) ; - NumericVector y = NumericVector::create( 0.0, 1.0 ) ; - List z = List::create( x, y ) ; - - return z ; -} diff --git a/anRpackage/src/rcpp_hello_world.o b/anRpackage/src/rcpp_hello_world.o deleted file mode 100644 index a446de8..0000000 Binary files a/anRpackage/src/rcpp_hello_world.o and /dev/null differ diff --git a/get_tau.cpp b/get_tau.cpp index 1c0e28d..2d12c6d 100644 --- a/get_tau.cpp +++ b/get_tau.cpp @@ -1,6 +1,6 @@ //Credit: Inspired and amended from https://github.com/HopkinsIDD/IDSpatialStats/commit/2782d6dcc9ee4be9855b5e468ce789425b81d49a by @gilesjohnr @jlessler //Author: @t-pollington -//Date: 29 Jan 2019 +//Date: 6 Aug 2019 #include #include #include @@ -10,66 +10,68 @@ using namespace Rcpp; // [[Rcpp::export]] NumericVector getTau(const NumericVector ORIG_ID, const NumericVector x, const NumericVector y, const NumericVector onset, const NumericVector r, const NumericVector r_low, SEXP index){ //see how multiple vector arguments are parsed in here rather than the single posmat matrix + IntegerVector ORIG_IDint = as(ORIG_ID); + IntegerVector onsetint = as(onset); NTYPE i,j,k; - double dist2 = 0; - double r2 = 0; - double r2_low = 0; - long long num_cnt, denom_cnt; //counters for those filling conditions// + float dist2 = 0; + float r2 = 0; + float r2_low = 0; + unsigned long num_cnt, denom_cnt; //counters for those filling conditions// unsigned short serialintvl = 7; //mean serial interval of the disease unsigned short r_size = r.size(); NTYPE N = ORIG_ID.size(); int *inds = INTEGER(index); NumericVector tau(r_size, NULL); - double piInf = 0; + float piInf = 0; bool check = 1; bool bstrapconflict = 0; bool sameperson = 0; bool iscasepair = 0; bool temprelated = 0; bool withindist = 0; - -if(*inds==-1){ //if index was set to -1 then it means we can turn off bootstrapping checks - check = 0; -} - -num_cnt = 0; -denom_cnt = 0; - -//calc piInf -for (i=0;i= r2_low) && (dist2 < r2)); - sameperson = (ORIG_ID[i]==ORIG_ID[j]); - denom_cnt = denom_cnt + (!(bstrapconflict)*!(sameperson)*withindist); - iscasepair = (onset[i]!=-999) && (onset[j]!=-999); - temprelated = (abs(onset[i]-onset[j]) <= serialintvl); - num_cnt = num_cnt + (!(bstrapconflict)*!(sameperson)*iscasepair*temprelated*withindist); + for (j=0; j= r2_low) && (dist2 < r2)); + sameperson = (ORIG_IDint[i]==ORIG_IDint[j]); + denom_cnt = denom_cnt + (!(bstrapconflict)*!(sameperson)*withindist); + iscasepair = (onsetint[i]!=-999) && (onsetint[j]!=-999); + temprelated = (abs(onsetint[i]-onsetint[j]) <= serialintvl); + num_cnt = num_cnt + (!(bstrapconflict)*!(sameperson)*iscasepair*temprelated*withindist); + } } + tau[k] = (float)num_cnt/denom_cnt; // pi(r.min,r.max) + tau[k] = (float)tau[k]/piInf; } - tau[k] = (double)num_cnt/denom_cnt; // pi(r.min,r.max) - tau[k] = (double)tau[k]/piInf; -} -return(tau); + return(tau); } \ No newline at end of file diff --git a/run_get_tau.R b/run_get_tau.R index f5cc411..8bbb278 100644 --- a/run_get_tau.R +++ b/run_get_tau.R @@ -1,10 +1,16 @@ -rm(list = ls()) #clean up from before -library(Rcpp) #sourceCpp() -setwd(FolderWhereYouStoreYourData) -load(file = "YourData.RData") #say you have called your matrix object "X" -setwd(FolderWhereYouStoreYourTauCFile) -sourceCpp("get_tau.cpp") - -r.max = seq(50,500,50) #upper bound of each bin, in metres -r.min = rep.int(0,length(r.max)) #lower bound of each bin -getTau(X[,"ORIG_ID"],X[,"x"],X[,"y"],X[,"onset"],r.max,r.min,as.integer(-1)) #note that as we want the point estimate of tau we set the last argument to -1. You could adapt this function for bootstrapping by parsing in a vector of indices for the last argument. +rm(list = ls()) #clean up from before +library(Rcpp) #sourceCpp() +setwd(FolderWhereYouStoreYourData) +load(file = "YourData.RData") #say you have called your matrix object "X" +setwd(FolderWhereYouStoreYourTauCFile) +sourceCpp("get_tau.cpp") + +r.max = seq(50,500,50) #upper bound of each bin, in metres +r.min = rep.int(0,length(r.max)) #lower bound of each bin +ptm <- proc.time() +for (i in 1:10) { + getTau(X.NW[,"ORIG_IDenum"],X.NW[,"x"],X.NW[,"y"],X.NW[,"KA"],r.max,r.min,as.integer(-1)) #note that as we want the point estimate of tau we set the last argument to -1. You could adapt this function for bootstrapping by parsing in a vector of indices for the last argument. +} +timing = (proc.time() - ptm) +timing/10 #2.0578s for commit 5dceac9 get_tau.cpp +#2.03s for NTYPE change to unsigned short \ No newline at end of file diff --git a/run_get_taupersonal.R b/run_get_taupersonal.R deleted file mode 100644 index b856ec7..0000000 --- a/run_get_taupersonal.R +++ /dev/null @@ -1,42 +0,0 @@ -rm(list = ls()) #clean up from before -library(Rcpp) #sourceCpp() -setwd(FolderWhereYouStoreYourData) -load(file = "YourData.RData") #say you have called your matrix object "X" -setwd(FolderWhereYouStoreYourTauCFile) -sourceCpp("get_tau.cpp") - -r.max = seq(50,500,50) #upper bound of each bin, in metres -r.min = rep.int(0,length(r.max)) #lower bound of each bin -ptm <- proc.time() -for (i in 1:10) { - getTau(X.NW[,"ORIG_IDenum"],X.NW[,"x"],X.NW[,"y"],X.NW[,"KA"],r.max,r.min,as.integer(-1)) #note that as we want the point estimate of tau we set the last argument to -1. You could adapt this function for bootstrapping by parsing in a vector of indices for the last argument. -} -timing = (proc.time() - ptm) -timing/10 #2.0578s for commit 5dceac9 get_tau.cpp -#2.03s for NTYPE change to unsigned short - -ORIG_IDenum <- file("ORIG_IDenum.bin", "wb") -writeBin(X.NW[,"ORIG_IDenum"], ORIG_IDenum,size = integer()) -close(ORIG_IDenum) -x <- file("x.bin", "wb") -writeBin(X.NW[,"x"], x, size = double()) -close(x) -y <- file("y.bin", "wb") -writeBin(X.NW[,"y"], y, size = double()) -close(y) -KA <- file("KA.bin", "wb") -writeBin(X.NW[,"KA"], KA, size = integer()) -close(KA) -r.maxbin <- file("r.max.bin", "wb") -writeBin(r.max, r.maxbin, size = integer()) -close(r.maxbin) -r.minbin <- file("r.min.bin", "wb") -writeBin(r.min, r.minbin, size = integer()) -close(r.minbin) -indx <- file("indx.bin", "wb") -indxvector = as.numeric(sample.int(16221,16221,replace = T)) -writeBin(indxvector, indx, size = integer()) -close(indx) -indx <- file("indx.bin", "rb") -readBin(con = indx,what = 4,n=16221) -close(indx) diff --git a/tau-statistic-speedup.Rproj b/tau-statistic-speedup.Rproj index 8e3c2eb..5bf29f4 100644 --- a/tau-statistic-speedup.Rproj +++ b/tau-statistic-speedup.Rproj @@ -11,3 +11,8 @@ Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackagePath: anRpackage +PackageInstallArgs: --no-multiarch --with-keep.source diff --git a/testloadbinary.cpp b/testloadbinary.cpp deleted file mode 100644 index 2e9f890..0000000 --- a/testloadbinary.cpp +++ /dev/null @@ -1,328 +0,0 @@ -#include //printf -#include //cout -#include //streampos -#include -#include -#include -#include -#include -#define NTYPE unsigned short //fine if N<=65535 -using namespace std; -using namespace std::chrono; -int main(){ - ifstream fileorig_idenum("ORIG_IDenum.bin", ios::in|ios::binary|ios::ate); - double* orig_idenum; - int idx=16221; - orig_idenum = (double*)malloc((idx+1) * sizeof(orig_idenum[0])); - if(orig_idenum == NULL) - { - printf("not enough memory"); - return 1; - } - if (fileorig_idenum.is_open()) - { - streampos size = fileorig_idenum.tellg(); - cout << "size=" << size << " bytes\n"; - char * memblock = new char [size]; - fileorig_idenum.seekg (0, ios::beg); - fileorig_idenum.read (memblock, size); - fileorig_idenum.close(); - cout << "orig_idenum is in memory \n"; - orig_idenum = (double*)memblock; //reinterpret - } - else - { - cout << "Unable to open file"; - } - printf("orig_idenum[0] = %f\n", orig_idenum[0]); - printf("orig_idenum[1] = %f\n", orig_idenum[1]); - printf("orig_idenum[2] = %f\n", orig_idenum[2]); - printf("orig_idenum[16220] = %f\n", orig_idenum[16220]); - vector orig_idenumv; - for(int i = 0; i xv; - for(int i = 0; i yv; - for(int i = 0; i KAv; - for(int i = 0; i rmaxv; - for(int i = 0; i rminv; - for(int i = 0; i indxv; - for(int i = 0; i tau(r_size, 0); - double piInf = 0; - bool check = 1; - bool bstrapconflict = 0; - bool sameperson = 0; - bool iscasepair = 0; - bool temprelated = 0; - bool withindist = 0; - - printf("N = %d\n", N); - printf("r_size = %d\n", r_size); - - if(indxv[0]==-1){ //if index was set to -1 then it means we can turn off bootstrapping checks - check = 0; - } - - num_cnt = 0; - denom_cnt = 0; - - //calc piInf - //#pragma omp simd collapse(2) reduction(+:num_cnt, +:denom_cnt) - for (i=0;i= r2_low) && (dist2 < r2)); - sameperson = (orig_idenumv[i]==orig_idenumv[j]); - denom_cnt = denom_cnt + (!(bstrapconflict)*!(sameperson)*withindist); - iscasepair = (KAv[i]!=-999) && (KAv[j]!=-999); - temprelated = (abs(KAv[i]-KAv[j]) <= serialintvl); - num_cnt = num_cnt + (!(bstrapconflict)*!(sameperson)*iscasepair*temprelated*withindist); - } - } - tau[k] = (double)num_cnt/denom_cnt; // pi(r.min,r.max) - tau[k] = (double)tau[k]/piInf; - } - auto stop = high_resolution_clock::now(); - auto duration = duration_cast(stop - start); - cout << duration.count() << endl; - for (k=0;k