diff --git a/.gitignore b/.gitignore
index b66fee6..73978b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@
*.RData
*.Ruserdata
*.bin
-*.out
\ No newline at end of file
+*.out
+.Rproj.user
\ No newline at end of file
diff --git a/README.md b/README.md
index 0fbe69a..ec18b65 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ An optimised implementation of the tau statistic (relative prevalence ratio form
* [References and credits](#References-and-credits)
## The statistic
-I was evaluating the ['elevated prevalence' form](https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0155249.s003&type=supplementary#page=6 "Lessler et al. Appendix 5, p6") of the tau statistic [[Lessler et al]](#References-and-credits) as we had data on the underlying population (i.e. non-cases as well as cases) containing months of disease onset *ti* and UTM coordinates of their household (*xi*,*yi*). I optimised the implementation of the tau statistic from the development repo of the `IDSpatialStats::get.tau()` function, leading to a ~**52x speedup**.
+I was evaluating the ['elevated prevalence' form](https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0155249.s003&type=supplementary#page=6 "Lessler et al. Appendix 5, p6") of the tau statistic [[Lessler et al]](#References-and-credits) as we had data on the underlying population (i.e. non-cases as well as cases) containing months of disease onset *ti* and UTM coordinates of their household (*xi*,*yi*). I optimised the implementation of the tau statistic from the development repo of the `IDSpatialStats::get.tau()` function, leading to a ~**76x speedup**.
@@ -23,11 +23,11 @@ which is the proportion of related case pairs within a specified distance versus
The relatedness of a case pair *zij*, is determined here using temporal information, if then *z_ij*=1 else 0.
## How the speedup was done
-Rather than running `IDSpatialStats::get.tau()` function in an R script as described by `?get.tau()`, I isolated the C function responsible (`get_tau` on [line 403](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L403) from `spatialfuncs.c` source file for the [2782d6d](https://github.com/HopkinsIDD/IDSpatialStats/commit/2782d6dcc9ee4be9855b5e468ce789425b81d49a "Commit 2782d6d on 17 Dec 2018") commit), coded the four features summarised below, then ran it in an R script by sourcing it with `Rcpp::sourceCpp()` and then calling `getTau()` without needing `library(IDSpatialStats)` on [lines 6-10](https://github.com/t-pollington/tau-statistic-speedup/blob/master/run_get_tau.R#L6). I have provided both the R script file `run_get_tau.R` and the C file `get_tau.cpp` containing useful comments.
+Rather than running `IDSpatialStats::get.tau()` function in an R script as described by `?get.tau()`, I isolated the C function responsible (`get_tau` on [line 403](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L403) from `spatialfuncs.c` source file for the [2782d6d](https://github.com/HopkinsIDD/IDSpatialStats/commit/2782d6dcc9ee4be9855b5e468ce789425b81d49a "Commit 2782d6d on 17 Dec 2018") commit), coded the four features summarised below, then ran it in an R script by sourcing it with `Rcpp::sourceCpp()` and then calling `getTau()` without needing `library(IDSpatialStats)` on [lines 6-10](https://github.com/t-pollington/tau-statistic-speedup/blob/master/run_get_tau.R#L6). I have provided commented R script file `run_get_tau.R` (in the root) and the C file `get_tau.cpp` (in the /src of the package) containing useful comments.
1. **Stop calls to R within C** (~26x speedup)
-*Currently*: Previously the R function `get.tau()` would call the `get_tau()` C function on lines [403-449](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L403
+*Previously*: Previously the R function `get.tau()` would call the `get_tau()` C function on lines [403-449](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L403
) (and internally `get_pi()` on [line 427](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L427
)). My `getTau()` function skips that step for easier reading here, not contributing to the speedup; so in essence the heart of the code isolated was described by `get_pi()` on [lines 64-148](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L64
). `get_pi()` has 3 nested loops: over distance windows, then a double loop for paired links between people. The slowdown occurs at [lines 126-129](https://github.com/HopkinsIDD/IDSpatialStats/blob/master/src/spatialfuncs.c#L126
@@ -37,19 +37,25 @@ Rather than running `IDSpatialStats::get.tau()` function in an R script as descr
2. **Stop repeat evaluations of undirected edges** (~2x speedup)
-*Currently*: In the sum over all people the same link will be visited twice ie *i*->*j* and *j*->*i* but this isn't necessary as is symmetric to *i*,*j* switching due to the modulus function. In most temporally-related or serotype-shared scenarios the pairs are undirected and so the summation should really be 'upper triangular' style i.e. .
+*Previously*: In the sum over all people the same link will be visited twice ie *i*->*j* and *j*->*i* but this isn't necessary as is symmetric to *i*,*j* switching due to the modulus function. In most temporally-related or serotype-shared scenarios the pairs are undirected and so the summation should really be 'upper triangular' style i.e. .
*Change*: Understandably by halving the loop we get a ~2x speedup.
-3. **Split the `posmat` data matrix into multiple vectors** (~20% speedup)
+3. **Explicitly force the compiler to use AVX2 CPU vectorisation** (~47% speedup)
-*Currently*: `posmat` is the matrix argument that gets fed into `get.tau()`. As the loops sequentially go through `i`, `j` & `k`, if posmat is large columnwise then the next row's value in memory may not be that close to the current location and this extra memory access time will add delays.
+*Previously*: The standard compiler flags are '-O2' and it is unclear if the loops were fully vectorised given the `if(...) continue;` statements within them.
+
+*Change*: Since the first version I have built the .cpp file into a minimal package using `Rcpp::Rcpp.package.skeleton()` with empty description and help files. The purpose is that the _configure_ file can set the 'g++' compiler flags to utilise '-O3' optimisation and '-mavx2' vectorisation which is common in CPU architectures; to tweak to your architecture please read [How to change and set Rcpp compile arguments](https://stackoverflow.com/questions/32586455/how-to-change-and-set-rcpp-compile-arguments). All `if()` statements in the inner loops have been replaced with boolean arithmetic.
+
+4. **Split the `posmat` data matrix into multiple vectors** (~20% speedup)
+
+*Previously*: `posmat` is the matrix argument that gets fed into `get.tau()`. As the loops sequentially go through `i`, `j` & `k`, if posmat is large columnwise then the next row's value in memory may not be that close to the current location and this extra memory access time will add delays.
*Change*: Using vectors for each variable guarantees that the next observation for a variable will be next in memory.
-4. **Work with squared distances to avoid `sqrt()`** (negligible speedup)
+5. **Work with squared distances to avoid `sqrt()`** (negligible speedup)
-*Currently*: To calculate the distance separation *d_ij* a Euclidean distance was calculated.
+*Previously*: To calculate the distance separation *d_ij* a Euclidean distance was calculated.
*Change*: Working with squared distance ranges can make `sqrt()` redundant.
@@ -59,13 +65,13 @@ Although I have found enormous speedup improvements (constructing 500 bootstrapp
## Replication
Unfortunately I can't share the dataset but can describe what you need:
-* R v3.5.1
+* R v3.6.0
* library `Rcpp` for `sourceCpp()`. `IDSpatialStats` isn't required.
* data = R `matrix`-type object with columns named: "ORIG_ID"; "x"; "y"; "onset" and no missing data. For non-cases, the "onset" column should be numerically coded as -999.
## Features not implemented
-* parallel computations across the `for(i){}` loop for *i* in `get_tau.cpp`. I tried using parallel packages in R and C's `#pragma omp parallel for` with `#include ` but to no avail.
-* GPU computations. A good starting place for rapid code development would be MATLAB's `gpuArray` class.
+* parallel computations across the `for(i){}` loop for *i* in `get_tau.cpp`. I tried to no avail using 'RcppArmadillo' to enable `#pragma omp parallel for` within `get_tau.cpp` as there are [concerns that `Rcpp` alone can't deal with OpenMP](https://stackoverflow.com/questions/48069990/multithreaded-simd-vectorized-mandelbrot-in-r-using-rcpp-openmp). Applying `#pragma omp simd` to the loops after the compiler optimisations above gave no speed improvement; there were difficulties applying it since although the for loop over *i* goes from 1 to N the nested *j* loop is of different length which depends on *i*.
+* GPU computations. A good starting place for rapid code development would be MATLAB's `gpuArray` class or ArrayFire. My only reservation is the code would need adapting so the distance matrix is calculated on-the-fly only once rather than over the k-loop for the size of the radius intervals required. This would help reduce the amount of GPU device memory accesses from the GPU cores.
* separating distance calculations from the Rfun code didn't lead to a speedup. Even though distances are needless calculated multiple times, the slowdown probably comes from storing these distances in slower-to-access caches, when a just-in-time method works faster.
* **Have you found an even faster way to do this? I'm open in principle to pull requests to this repo but message me to check.**
* **Found a bug or even a typo? I'd love to know! We're not perfect and we should all be open to criticism so we can do the best science for infectious disease modelling.**
@@ -73,4 +79,4 @@ Unfortunately I can't share the dataset but can describe what you need:
## References and credits
1. Lessler J, Salje H, Grabowski MK, Cummings DAT. *Measuring Spatial Dependence for Infectious Disease Epidemiology*. PLoS One 2016; 11: 5–1–13. doi: [10.1371/journal.pone.0155249](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0155249).
2. `HopkinsIDD/IDSpatialStats` (development repo for R's IDSpatialStats package) maintained by @jlessler and @gilesjohnr. Code from the [CRAN read-only mirror](https://github.com/cran/IDSpatialStats) hasn't been used as it is several months behind the development repo.
-* Thanks to [CodeCogs](https://www.codecogs.com/latex/eqneditor.php "CodeCogs LaTeX equation editor, just copy+paste the HTML they provide") for renderring the mathematical formulae, let's hope they don't close down their site—Github **still** doesn't consider renderring LaTeX in README.md is a core functionality of code development!
+* Thanks to [CodeCogs](https://www.codecogs.com/latex/eqneditor.php "CodeCogs LaTeX equation editor, just copy+paste the HTML they provide") for renderring the mathematical formulae, let's hope they don't close down their site—Github **still** doesn't consider renderring LaTeX in README.md is a core functionality of code development!
\ No newline at end of file
diff --git a/anRpackage/DESCRIPTION b/anRpackage/DESCRIPTION
deleted file mode 100644
index 88c7e8b..0000000
--- a/anRpackage/DESCRIPTION
+++ /dev/null
@@ -1,12 +0,0 @@
-Package: anRpackage
-Type: Package
-Title: What the Package Does in One 'Title Case' Line
-Version: 1.0
-Date: 2019-08-06
-Author: Your Name
-Maintainer: Your Name
-Description: One paragraph description of what the package does as one or more full
- sentences.
-License: GPL (>= 2)
-Imports: Rcpp (>= 1.0.1)
-LinkingTo: Rcpp
diff --git a/anRpackage/NAMESPACE b/anRpackage/NAMESPACE
deleted file mode 100644
index 0ca1d67..0000000
--- a/anRpackage/NAMESPACE
+++ /dev/null
@@ -1,3 +0,0 @@
-useDynLib(anRpackage, .registration=TRUE)
-exportPattern("^[[:alpha:]]+")
-importFrom(Rcpp, evalCpp)
diff --git a/anRpackage/R/RcppExports.R b/anRpackage/R/RcppExports.R
deleted file mode 100644
index 1bbcd19..0000000
--- a/anRpackage/R/RcppExports.R
+++ /dev/null
@@ -1,11 +0,0 @@
-# Generated by using Rcpp::compileAttributes() -> do not edit by hand
-# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
-
-getTau <- function(ORIG_ID, x, y, onset, r, r_low, index) {
- .Call(`_anRpackage_getTau`, ORIG_ID, x, y, onset, r, r_low, index)
-}
-
-rcpp_hello_world <- function() {
- .Call(`_anRpackage_rcpp_hello_world`)
-}
-
diff --git a/anRpackage/Read-and-delete-me b/anRpackage/Read-and-delete-me
deleted file mode 100644
index 6d6236d..0000000
--- a/anRpackage/Read-and-delete-me
+++ /dev/null
@@ -1,9 +0,0 @@
-* Edit the help file skeletons in 'man', possibly combining help files for multiple
- functions.
-* Edit the exports in 'NAMESPACE', and add necessary imports.
-* Put any C/C++/Fortran code in 'src'.
-* If you have compiled code, add a useDynLib() directive to 'NAMESPACE'.
-* Run R CMD build to build the package tarball.
-* Run R CMD check to check the package tarball.
-
-Read "Writing R Extensions" for more information.
diff --git a/anRpackage/cleanup b/anRpackage/cleanup
deleted file mode 100755
index 9a8dcf9..0000000
--- a/anRpackage/cleanup
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-if [ -e "~/.R/Makevars.bak_CustomConfig" ]; then
- mv -f ~/.R/Makevars.bak_CustomConfig ~/.R/Makevars
-fi
\ No newline at end of file
diff --git a/anRpackage/configure b/anRpackage/configure
deleted file mode 100755
index 1839ac8..0000000
--- a/anRpackage/configure
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-if [ ! -d "~/.R" ]; then
- mkdir ~/.R; touch ~/.R/Makevars
- echo "CXXFLAGS= -O3 -std=c++11 -Wall -mavx2" > ~/.R/Makevars
-elif [ ! -e "~/.R/Makevars" ]; then
- touch ~/.R/Makevars
- echo "CXXFLAGS= -O3 -std=c++11 -Wall -mavx2" > ~/.R/Makevars
-else
- mv ~/.R/Makevars ~/.R/Makevars.bak_CustomConfig
- echo "CXXFLAGS= -O3 -std=c++11 -Wall -mavx2" > ~/.R/Makevars
-fi
\ No newline at end of file
diff --git a/anRpackage/man/anRpackage-package.Rd b/anRpackage/man/anRpackage-package.Rd
deleted file mode 100644
index a23ac20..0000000
--- a/anRpackage/man/anRpackage-package.Rd
+++ /dev/null
@@ -1,34 +0,0 @@
-\name{anRpackage-package}
-\alias{anRpackage-package}
-\alias{anRpackage}
-\docType{package}
-\title{
- A short title line describing what the package does
-}
-\description{
- A more detailed description of what the package does. A length
- of about one to five lines is recommended.
-}
-\details{
- This section should provide a more detailed overview of how to use the
- package, including the most important functions.
-}
-\author{
-Your Name, email optional.
-
-Maintainer: Your Name
-}
-\references{
- This optional section can contain literature or other references for
- background information.
-}
-\keyword{ package }
-\seealso{
- Optional links to other man pages
-}
-\examples{
- \dontrun{
- ## Optional simple examples of the most important functions
- ## These can be in \dontrun{} and \donttest{} blocks.
- }
-}
diff --git a/anRpackage/man/rcpp_hello_world.Rd b/anRpackage/man/rcpp_hello_world.Rd
deleted file mode 100644
index e4f90bf..0000000
--- a/anRpackage/man/rcpp_hello_world.Rd
+++ /dev/null
@@ -1,17 +0,0 @@
-\name{rcpp_hello_world}
-\alias{rcpp_hello_world}
-\docType{package}
-\title{
-Simple function using Rcpp
-}
-\description{
-Simple function using Rcpp
-}
-\usage{
-rcpp_hello_world()
-}
-\examples{
-\dontrun{
-rcpp_hello_world()
-}
-}
diff --git a/anRpackage/src/RcppExports.cpp b/anRpackage/src/RcppExports.cpp
deleted file mode 100644
index c909457..0000000
--- a/anRpackage/src/RcppExports.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-// Generated by using Rcpp::compileAttributes() -> do not edit by hand
-// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
-
-#include
-
-using namespace Rcpp;
-
-// getTau
-NumericVector getTau(const NumericVector ORIG_ID, const NumericVector x, const NumericVector y, const NumericVector onset, const NumericVector r, const NumericVector r_low, SEXP index);
-RcppExport SEXP _anRpackage_getTau(SEXP ORIG_IDSEXP, SEXP xSEXP, SEXP ySEXP, SEXP onsetSEXP, SEXP rSEXP, SEXP r_lowSEXP, SEXP indexSEXP) {
-BEGIN_RCPP
- Rcpp::RObject rcpp_result_gen;
- Rcpp::RNGScope rcpp_rngScope_gen;
- Rcpp::traits::input_parameter< const NumericVector >::type ORIG_ID(ORIG_IDSEXP);
- Rcpp::traits::input_parameter< const NumericVector >::type x(xSEXP);
- Rcpp::traits::input_parameter< const NumericVector >::type y(ySEXP);
- Rcpp::traits::input_parameter< const NumericVector >::type onset(onsetSEXP);
- Rcpp::traits::input_parameter< const NumericVector >::type r(rSEXP);
- Rcpp::traits::input_parameter< const NumericVector >::type r_low(r_lowSEXP);
- Rcpp::traits::input_parameter< SEXP >::type index(indexSEXP);
- rcpp_result_gen = Rcpp::wrap(getTau(ORIG_ID, x, y, onset, r, r_low, index));
- return rcpp_result_gen;
-END_RCPP
-}
-// rcpp_hello_world
-List rcpp_hello_world();
-RcppExport SEXP _anRpackage_rcpp_hello_world() {
-BEGIN_RCPP
- Rcpp::RObject rcpp_result_gen;
- Rcpp::RNGScope rcpp_rngScope_gen;
- rcpp_result_gen = Rcpp::wrap(rcpp_hello_world());
- return rcpp_result_gen;
-END_RCPP
-}
-
-static const R_CallMethodDef CallEntries[] = {
- {"_anRpackage_getTau", (DL_FUNC) &_anRpackage_getTau, 7},
- {"_anRpackage_rcpp_hello_world", (DL_FUNC) &_anRpackage_rcpp_hello_world, 0},
- {NULL, NULL, 0}
-};
-
-RcppExport void R_init_anRpackage(DllInfo *dll) {
- R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
- R_useDynamicSymbols(dll, FALSE);
-}
diff --git a/anRpackage/src/RcppExports.o b/anRpackage/src/RcppExports.o
deleted file mode 100644
index e349a89..0000000
Binary files a/anRpackage/src/RcppExports.o and /dev/null differ
diff --git a/anRpackage/src/anRpackage.so b/anRpackage/src/anRpackage.so
deleted file mode 100755
index 298ce61..0000000
Binary files a/anRpackage/src/anRpackage.so and /dev/null differ
diff --git a/anRpackage/src/get_tau.cpp b/anRpackage/src/get_tau.cpp
deleted file mode 100644
index 883e597..0000000
--- a/anRpackage/src/get_tau.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//Credit: Inspired and amended from https://github.com/HopkinsIDD/IDSpatialStats/commit/2782d6dcc9ee4be9855b5e468ce789425b81d49a by @gilesjohnr @jlessler
-//Author: @t-pollington
-//Date: 6 Aug 2019
-#include
-#include
-#include
-#define NTYPE unsigned short //fine if N<=65535
-using namespace std;
-using namespace Rcpp;
-// [[Rcpp::export]]
-
-NumericVector getTau(const NumericVector ORIG_ID, const NumericVector x, const NumericVector y, const NumericVector onset, const NumericVector r, const NumericVector r_low, SEXP index){ //see how multiple vector arguments are parsed in here rather than the single posmat matrix
- IntegerVector ORIG_IDint = as(ORIG_ID);
- IntegerVector onsetint = as(onset);
- NTYPE i,j,k;
- float dist2 = 0;
- float r2 = 0;
- float r2_low = 0;
- unsigned long num_cnt, denom_cnt; //counters for those filling conditions//
- unsigned short serialintvl = 7; //mean serial interval of the disease
- unsigned short r_size = r.size();
- NTYPE N = ORIG_ID.size();
- int *inds = INTEGER(index);
- NumericVector tau(r_size, NULL);
- float piInf = 0;
- bool check = 1;
- bool bstrapconflict = 0;
- bool sameperson = 0;
- bool iscasepair = 0;
- bool temprelated = 0;
- bool withindist = 0;
-
- if(*inds==-1){ //if index was set to -1 then it means we can turn off bootstrapping checks
- check = 0;
- }
-
- num_cnt = 0;
- denom_cnt = 0;
-
- //calc piInf
- for (i=0;i= r2_low) && (dist2 < r2));
- sameperson = (ORIG_IDint[i]==ORIG_IDint[j]);
- denom_cnt = denom_cnt + (!(bstrapconflict)*!(sameperson)*withindist);
- iscasepair = (onsetint[i]!=-999) && (onsetint[j]!=-999);
- temprelated = (abs(onsetint[i]-onsetint[j]) <= serialintvl);
- num_cnt = num_cnt + (!(bstrapconflict)*!(sameperson)*iscasepair*temprelated*withindist);
- }
- }
- tau[k] = (float)num_cnt/denom_cnt; // pi(r.min,r.max)
- tau[k] = (float)tau[k]/piInf;
- }
- return(tau);
-}
\ No newline at end of file
diff --git a/anRpackage/src/get_tau.o b/anRpackage/src/get_tau.o
deleted file mode 100644
index 771e71e..0000000
Binary files a/anRpackage/src/get_tau.o and /dev/null differ
diff --git a/anRpackage/src/rcpp_hello_world.cpp b/anRpackage/src/rcpp_hello_world.cpp
deleted file mode 100644
index 98a959c..0000000
--- a/anRpackage/src/rcpp_hello_world.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-
-#include
-using namespace Rcpp;
-
-// [[Rcpp::export]]
-List rcpp_hello_world() {
-
- CharacterVector x = CharacterVector::create( "foo", "bar" ) ;
- NumericVector y = NumericVector::create( 0.0, 1.0 ) ;
- List z = List::create( x, y ) ;
-
- return z ;
-}
diff --git a/anRpackage/src/rcpp_hello_world.o b/anRpackage/src/rcpp_hello_world.o
deleted file mode 100644
index a446de8..0000000
Binary files a/anRpackage/src/rcpp_hello_world.o and /dev/null differ
diff --git a/get_tau.cpp b/get_tau.cpp
index 1c0e28d..2d12c6d 100644
--- a/get_tau.cpp
+++ b/get_tau.cpp
@@ -1,6 +1,6 @@
//Credit: Inspired and amended from https://github.com/HopkinsIDD/IDSpatialStats/commit/2782d6dcc9ee4be9855b5e468ce789425b81d49a by @gilesjohnr @jlessler
//Author: @t-pollington
-//Date: 29 Jan 2019
+//Date: 6 Aug 2019
#include
#include
#include
@@ -10,66 +10,68 @@ using namespace Rcpp;
// [[Rcpp::export]]
NumericVector getTau(const NumericVector ORIG_ID, const NumericVector x, const NumericVector y, const NumericVector onset, const NumericVector r, const NumericVector r_low, SEXP index){ //see how multiple vector arguments are parsed in here rather than the single posmat matrix
+ IntegerVector ORIG_IDint = as(ORIG_ID);
+ IntegerVector onsetint = as(onset);
NTYPE i,j,k;
- double dist2 = 0;
- double r2 = 0;
- double r2_low = 0;
- long long num_cnt, denom_cnt; //counters for those filling conditions//
+ float dist2 = 0;
+ float r2 = 0;
+ float r2_low = 0;
+ unsigned long num_cnt, denom_cnt; //counters for those filling conditions//
unsigned short serialintvl = 7; //mean serial interval of the disease
unsigned short r_size = r.size();
NTYPE N = ORIG_ID.size();
int *inds = INTEGER(index);
NumericVector tau(r_size, NULL);
- double piInf = 0;
+ float piInf = 0;
bool check = 1;
bool bstrapconflict = 0;
bool sameperson = 0;
bool iscasepair = 0;
bool temprelated = 0;
bool withindist = 0;
-
-if(*inds==-1){ //if index was set to -1 then it means we can turn off bootstrapping checks
- check = 0;
-}
-
-num_cnt = 0;
-denom_cnt = 0;
-
-//calc piInf
-for (i=0;i= r2_low) && (dist2 < r2));
- sameperson = (ORIG_ID[i]==ORIG_ID[j]);
- denom_cnt = denom_cnt + (!(bstrapconflict)*!(sameperson)*withindist);
- iscasepair = (onset[i]!=-999) && (onset[j]!=-999);
- temprelated = (abs(onset[i]-onset[j]) <= serialintvl);
- num_cnt = num_cnt + (!(bstrapconflict)*!(sameperson)*iscasepair*temprelated*withindist);
+ for (j=0; j= r2_low) && (dist2 < r2));
+ sameperson = (ORIG_IDint[i]==ORIG_IDint[j]);
+ denom_cnt = denom_cnt + (!(bstrapconflict)*!(sameperson)*withindist);
+ iscasepair = (onsetint[i]!=-999) && (onsetint[j]!=-999);
+ temprelated = (abs(onsetint[i]-onsetint[j]) <= serialintvl);
+ num_cnt = num_cnt + (!(bstrapconflict)*!(sameperson)*iscasepair*temprelated*withindist);
+ }
}
+ tau[k] = (float)num_cnt/denom_cnt; // pi(r.min,r.max)
+ tau[k] = (float)tau[k]/piInf;
}
- tau[k] = (double)num_cnt/denom_cnt; // pi(r.min,r.max)
- tau[k] = (double)tau[k]/piInf;
-}
-return(tau);
+ return(tau);
}
\ No newline at end of file
diff --git a/run_get_tau.R b/run_get_tau.R
index f5cc411..8bbb278 100644
--- a/run_get_tau.R
+++ b/run_get_tau.R
@@ -1,10 +1,16 @@
-rm(list = ls()) #clean up from before
-library(Rcpp) #sourceCpp()
-setwd(FolderWhereYouStoreYourData)
-load(file = "YourData.RData") #say you have called your matrix object "X"
-setwd(FolderWhereYouStoreYourTauCFile)
-sourceCpp("get_tau.cpp")
-
-r.max = seq(50,500,50) #upper bound of each bin, in metres
-r.min = rep.int(0,length(r.max)) #lower bound of each bin
-getTau(X[,"ORIG_ID"],X[,"x"],X[,"y"],X[,"onset"],r.max,r.min,as.integer(-1)) #note that as we want the point estimate of tau we set the last argument to -1. You could adapt this function for bootstrapping by parsing in a vector of indices for the last argument.
+rm(list = ls()) #clean up from before
+library(Rcpp) #sourceCpp()
+setwd(FolderWhereYouStoreYourData)
+load(file = "YourData.RData") #say you have called your matrix object "X"
+setwd(FolderWhereYouStoreYourTauCFile)
+sourceCpp("get_tau.cpp")
+
+r.max = seq(50,500,50) #upper bound of each bin, in metres
+r.min = rep.int(0,length(r.max)) #lower bound of each bin
+ptm <- proc.time()
+for (i in 1:10) {
+ getTau(X.NW[,"ORIG_IDenum"],X.NW[,"x"],X.NW[,"y"],X.NW[,"KA"],r.max,r.min,as.integer(-1)) #note that as we want the point estimate of tau we set the last argument to -1. You could adapt this function for bootstrapping by parsing in a vector of indices for the last argument.
+}
+timing = (proc.time() - ptm)
+timing/10 #2.0578s for commit 5dceac9 get_tau.cpp
+#2.03s for NTYPE change to unsigned short
\ No newline at end of file
diff --git a/run_get_taupersonal.R b/run_get_taupersonal.R
deleted file mode 100644
index b856ec7..0000000
--- a/run_get_taupersonal.R
+++ /dev/null
@@ -1,42 +0,0 @@
-rm(list = ls()) #clean up from before
-library(Rcpp) #sourceCpp()
-setwd(FolderWhereYouStoreYourData)
-load(file = "YourData.RData") #say you have called your matrix object "X"
-setwd(FolderWhereYouStoreYourTauCFile)
-sourceCpp("get_tau.cpp")
-
-r.max = seq(50,500,50) #upper bound of each bin, in metres
-r.min = rep.int(0,length(r.max)) #lower bound of each bin
-ptm <- proc.time()
-for (i in 1:10) {
- getTau(X.NW[,"ORIG_IDenum"],X.NW[,"x"],X.NW[,"y"],X.NW[,"KA"],r.max,r.min,as.integer(-1)) #note that as we want the point estimate of tau we set the last argument to -1. You could adapt this function for bootstrapping by parsing in a vector of indices for the last argument.
-}
-timing = (proc.time() - ptm)
-timing/10 #2.0578s for commit 5dceac9 get_tau.cpp
-#2.03s for NTYPE change to unsigned short
-
-ORIG_IDenum <- file("ORIG_IDenum.bin", "wb")
-writeBin(X.NW[,"ORIG_IDenum"], ORIG_IDenum,size = integer())
-close(ORIG_IDenum)
-x <- file("x.bin", "wb")
-writeBin(X.NW[,"x"], x, size = double())
-close(x)
-y <- file("y.bin", "wb")
-writeBin(X.NW[,"y"], y, size = double())
-close(y)
-KA <- file("KA.bin", "wb")
-writeBin(X.NW[,"KA"], KA, size = integer())
-close(KA)
-r.maxbin <- file("r.max.bin", "wb")
-writeBin(r.max, r.maxbin, size = integer())
-close(r.maxbin)
-r.minbin <- file("r.min.bin", "wb")
-writeBin(r.min, r.minbin, size = integer())
-close(r.minbin)
-indx <- file("indx.bin", "wb")
-indxvector = as.numeric(sample.int(16221,16221,replace = T))
-writeBin(indxvector, indx, size = integer())
-close(indx)
-indx <- file("indx.bin", "rb")
-readBin(con = indx,what = 4,n=16221)
-close(indx)
diff --git a/tau-statistic-speedup.Rproj b/tau-statistic-speedup.Rproj
index 8e3c2eb..5bf29f4 100644
--- a/tau-statistic-speedup.Rproj
+++ b/tau-statistic-speedup.Rproj
@@ -11,3 +11,8 @@ Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackagePath: anRpackage
+PackageInstallArgs: --no-multiarch --with-keep.source
diff --git a/testloadbinary.cpp b/testloadbinary.cpp
deleted file mode 100644
index 2e9f890..0000000
--- a/testloadbinary.cpp
+++ /dev/null
@@ -1,328 +0,0 @@
-#include //printf
-#include //cout
-#include //streampos
-#include
-#include
-#include
-#include
-#include
-#define NTYPE unsigned short //fine if N<=65535
-using namespace std;
-using namespace std::chrono;
-int main(){
- ifstream fileorig_idenum("ORIG_IDenum.bin", ios::in|ios::binary|ios::ate);
- double* orig_idenum;
- int idx=16221;
- orig_idenum = (double*)malloc((idx+1) * sizeof(orig_idenum[0]));
- if(orig_idenum == NULL)
- {
- printf("not enough memory");
- return 1;
- }
- if (fileorig_idenum.is_open())
- {
- streampos size = fileorig_idenum.tellg();
- cout << "size=" << size << " bytes\n";
- char * memblock = new char [size];
- fileorig_idenum.seekg (0, ios::beg);
- fileorig_idenum.read (memblock, size);
- fileorig_idenum.close();
- cout << "orig_idenum is in memory \n";
- orig_idenum = (double*)memblock; //reinterpret
- }
- else
- {
- cout << "Unable to open file";
- }
- printf("orig_idenum[0] = %f\n", orig_idenum[0]);
- printf("orig_idenum[1] = %f\n", orig_idenum[1]);
- printf("orig_idenum[2] = %f\n", orig_idenum[2]);
- printf("orig_idenum[16220] = %f\n", orig_idenum[16220]);
- vector orig_idenumv;
- for(int i = 0; i xv;
- for(int i = 0; i yv;
- for(int i = 0; i KAv;
- for(int i = 0; i rmaxv;
- for(int i = 0; i rminv;
- for(int i = 0; i indxv;
- for(int i = 0; i tau(r_size, 0);
- double piInf = 0;
- bool check = 1;
- bool bstrapconflict = 0;
- bool sameperson = 0;
- bool iscasepair = 0;
- bool temprelated = 0;
- bool withindist = 0;
-
- printf("N = %d\n", N);
- printf("r_size = %d\n", r_size);
-
- if(indxv[0]==-1){ //if index was set to -1 then it means we can turn off bootstrapping checks
- check = 0;
- }
-
- num_cnt = 0;
- denom_cnt = 0;
-
- //calc piInf
- //#pragma omp simd collapse(2) reduction(+:num_cnt, +:denom_cnt)
- for (i=0;i= r2_low) && (dist2 < r2));
- sameperson = (orig_idenumv[i]==orig_idenumv[j]);
- denom_cnt = denom_cnt + (!(bstrapconflict)*!(sameperson)*withindist);
- iscasepair = (KAv[i]!=-999) && (KAv[j]!=-999);
- temprelated = (abs(KAv[i]-KAv[j]) <= serialintvl);
- num_cnt = num_cnt + (!(bstrapconflict)*!(sameperson)*iscasepair*temprelated*withindist);
- }
- }
- tau[k] = (double)num_cnt/denom_cnt; // pi(r.min,r.max)
- tau[k] = (double)tau[k]/piInf;
- }
- auto stop = high_resolution_clock::now();
- auto duration = duration_cast(stop - start);
- cout << duration.count() << endl;
- for (k=0;k