Merge pull request #632 from SebKrantz/development

Development
SebKrantz · Sep 7, 2024 · c80aebe · c80aebe
2 parents 812eba3 + e5d69ae
commit c80aebe
Show file tree

Hide file tree

Showing 6 changed files with 96 additions and 85 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# collapse 2.0.16.9000
+
+* In `GRP.default()`, the `"group.starts"` attribute is always returned, even if there is only one group or every observation is its own group. Thanks @JamesThompsonC (#631).  
+
 # collapse 2.0.16
 
 * Fixes an installation bug on some Linux systems (conflicting types) (#613). 

diff --git a/R/GRP.R b/R/GRP.R
@@ -132,10 +132,10 @@ GRP.default <- function(X, by = NULL, sort = .op[["sort"]], decreasing = FALSE,
   if(return.groups) {
       # if unit groups, don't subset rows...
       if(length(gs) == length(o) && (use.group || sorted)) {
-        ust <- NULL
+        ust <- st
         groups <- if(is.list(X)) .Call(C_subsetCols, X, by, FALSE) else `names<-`(list(X), namby)
       } else {
-        ust <- if(use.group || sorted) st else .Call(C_subsetVector, o, st, FALSE) # o[st]
+        ust <- if(use.group || sorted) st else if(length(gs) == length(o)) o else .Call(C_subsetVector, o, st, FALSE) # o[st]
         groups <- if(is.list(X)) .Call(C_subsetDT, X, ust, by, FALSE) else
           `names<-`(list(.Call(C_subsetVector, X, ust, FALSE)), namby) # subsetVector preserves attributes (such as "label")
       }
@@ -150,7 +150,7 @@ GRP.default <- function(X, by = NULL, sort = .op[["sort"]], decreasing = FALSE,
                         groups = groups,
                         group.vars = namby,
                         ordered = c(ordered = sort, sorted = sorted),
-                        order = if(return.order && !use.group) .Call(C_setAttributes, o, ao) else NULL, # `attributes<-`(o, attributes(o)[-2L]) This does a shallow copy on newer R versions # `attr<-`(o, "group.sizes", NULL): This deep-copies it..
+                        order = if(return.order && !use.group) `attributes<-`(o, ao) else NULL, # `attributes<-`(o, attributes(o)[-2L]) This does a shallow copy on newer R versions # `attr<-`(o, "group.sizes", NULL): This deep-copies it..
                         group.starts = ust, # Does not need to be computed by group()
                         call = if(call) match.call() else NULL), "GRP"))
 }

diff --git a/src/data.table_utils.c b/src/data.table_utils.c
@@ -267,16 +267,21 @@ SEXP frankds(SEXP xorderArg, SEXP xstartArg, SEXP xlenArg, SEXP dns) {
   int *xstart = INTEGER(xstartArg), *xlen = INTEGER(xlenArg), *xorder = INTEGER(xorderArg);
   n = length(xorderArg);
   ng = length(xstartArg);
+  if(n > 0 && n == ng && asInteger(dns) == 1) return xorderArg;
   SEXP ans = PROTECT(allocVector(INTSXP, n));
   int *ians = INTEGER(ans);
   if(n > 0) {
     switch(asInteger(dns)) {
     case 0: // Not Sorted
       k=1;
-      for (i = 0; i != ng; i++) {
-        for (j = xstart[i]-1, end = xstart[i]+xlen[i]-1; j < end; j++)
-          ians[xorder[j]-1] = k;
-        k++;
+      if(n == ng) {
+        for (i = 0; i != n; i++) ians[xorder[i]-1] = i+1;
+      } else {
+        for (i = 0; i != ng; i++) {
+          for (j = xstart[i]-1, end = xstart[i]+xlen[i]-1; j < end; j++)
+            ians[xorder[j]-1] = k;
+          k++;
+        }
       }
       break;
     case 1: // Sorted
@@ -286,7 +291,7 @@ SEXP frankds(SEXP xorderArg, SEXP xstartArg, SEXP xlenArg, SEXP dns) {
         k++;
       }
       break;
-    case 2: // This is basically run-length type group-id
+    case 2: // This is basically run-length type group-id: currently not used in collapse!
       for (i = 0; i != ng; i++) {
         k=1;
         for (j = xstart[i]-1, end = xstart[i]+xlen[i]-1; j < end; j++)
@@ -297,7 +302,7 @@ SEXP frankds(SEXP xorderArg, SEXP xstartArg, SEXP xlenArg, SEXP dns) {
     }
   }
   UNPROTECT(1);
-  return(ans);
+  return ans;
 }
 
 // from data.table_assign.c:

diff --git a/src/fmean.c b/src/fmean.c
@@ -208,7 +208,9 @@ void fmean_int_g_impl(double *restrict pout, const int *restrict px, const int n
     R_Free(n);
   } else {
     --pout;
-    for(int i = l; i--; ) pout[pg[i]] += px[i]; // Used to stop loop when all groups passed with NA, but probably no speed gain since groups are mostly ordered.
+    for(int i = l; i--; ) {
+      pout[pg[i]] += px[i] == NA_INTEGER ? NA_REAL : px[i]; // Used to stop loop when all groups passed with NA, but probably no speed gain since groups are mostly ordered.
+    }
     ++pout;
     for(int i = ng; i--; ) pout[i] /= pgs[i];
   }

diff --git a/vignettes/collapse_and_data.table.Rmd b/vignettes/collapse_and_data.table.Rmd
@@ -40,7 +40,7 @@ From version 1.6.0 *collapse* seamlessly handles *data.tables*, permitting refer
 
 Both *data.table* and *collapse* are high-performance packages that work well together. For effective co-use it is helpful to understand where each has its strengths, what one can do what the other cannot, and where they overlap. Therefore this small comparison:
 
-* *data.table* offers an enhanced data frame based class to contain data (including list columns). For this class it provides a concise data manipulation syntax which also includes fast aggregation / slit-apply-combine computing, (rolling, non-equi) joins, keying, reshaping, some time-series functionality like lagging and rolling statistics, set operations on tables and a number of very useful other functions like the fast csv reader, fast switches, list-transpose etc.. *data.table* makes data management, and computations on data very easy and salable, supporting huge datasets in a very memory efficient way. The package caters well to the end user by compressing an enormous amount of functionality into two square brackets `[]`. Some of the exported functions are great for programming and also support other classes, but a lot of the functionality and optimization of *data.table* happens under the hood and can only be accessed through the non-standard evaluation table `[i, j, by]` syntax. This syntax has a cost of about 1-3 milliseconds for each call. Memory efficiency and thread-parallelization make *data.table* the star performer on huge data.   
+* *data.table* offers an enhanced data frame based class to contain data (including list columns). For this class it provides a concise data manipulation syntax which also includes fast aggregation / split-apply-combine computing, (rolling, non-equi) joins, keying, reshaping, some time-series functionality like lagging and rolling statistics, set operations on tables and a number of very useful other functions like the fast csv reader, fast switches, list-transpose etc. *data.table* makes data management, and computations on data very easy and scalable, supporting huge datasets in a very memory efficient way. The package caters well to the end user by compressing an enormous amount of functionality into two square brackets `[]`. Some of the exported functions are great for programming and also support other classes, but a lot of the functionality and optimization of *data.table* happens under the hood and can only be accessed through the non-standard evaluation table `[i, j, by]` syntax. This syntax has a cost of about 1-3 milliseconds for each call. Memory efficiency and thread-parallelization make *data.table* the star performer on huge data.   
 
 * *collapse* is class-agnostic in nature, supporting vectors, matrices, data frames and non-destructively handling most R classes and objects. It focuses on advanced statistical computing, providing fast column-wise grouped and weighted statistical functions, fast and complex data aggregation and transformations, linear fitting, time series and panel data computations, advanced summary statistics, and recursive processing of lists of data objects. It also includes powerful functions for data manipulation, grouping / factor generation, recoding, handling outliers and missing values. The package default for missing values is `na.rm = TRUE`, which is implemented efficiently in C/C++ in all functions. *collapse* supports both *tidyverse* (piped) and base R / standard evaluation programming. It makes accessible most of its internal C/C++ based functionality (like grouping objects). *collapse*'s R functions are simple and strongly optimized, i.e. they access the serial C/C++ code quickly, resulting in baseline execution speeds of 10-50 microseconds. All of this makes *collapse* ideal for advanced statistical computing on matrices and larger datasets, and tasks requiring fast programs with repeated function executions.