Skip to content

Commit

Permalink
Rename R to R-arrow (#68)
Browse files Browse the repository at this point in the history
* should change arrow to show R-arrow

* new arrow benchmarks report solution as R-arrow

* update arrow to R-arrow in a few more places

* Fix remaining issues in Tmonster#10 (#13)

* Fix remaining issues in arrow -> R-arrow rename

* Fix bug in rename code in report.R

The previous code was causing something wild to happen. The changed code is idiomatic code for replacing values in a data.frame based on a condition.

---------

Co-authored-by: Bryce Mecum <[email protected]>
  • Loading branch information
Tmonster and amoeba authored Dec 6, 2023
1 parent 4901623 commit c84afb2
Show file tree
Hide file tree
Showing 15 changed files with 42 additions and 35 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/regression.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion]
solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, duckdb-latest, datafusion]
name: Regression Tests solo solutions
runs-on: ubuntu-20.04
env:
Expand Down
6 changes: 3 additions & 3 deletions arrow/groupby-arrow.R → R-arrow/groupby-R-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ source("./_helpers/helpers.R")
stopifnot(requireNamespace("bit64", quietly=TRUE)) # used in chk to sum numeric columns
.libPaths("./R-arrow/r-arrow") # tidyverse/dplyr#4641 ## keep this in case the issue affects the arrow pkg as well; path must match the lib.loc used below (a non-existent directory is silently dropped by .libPaths)
suppressPackageStartupMessages({
library("arrow", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("arrow", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
})
ver = packageVersion("arrow")
git = ""
task = "groupby"
solution = "arrow"
solution = "R-arrow"
fun = "group_by"
cache = TRUE
on_disk = FALSE
Expand Down
6 changes: 3 additions & 3 deletions arrow/join-arrow.R → R-arrow/join-R-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ source("./_helpers/helpers.R")

.libPaths("./R-arrow/r-arrow") # tidyverse/dplyr#4641 ## keep this in case the issue affects the arrow pkg as well; path must match the lib.loc used below (a non-existent directory is silently dropped by .libPaths)
suppressPackageStartupMessages({
library("arrow", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("arrow", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
})
ver = packageVersion("arrow")
git = ""
task = "join"
solution = "arrow"
solution = "R-arrow"
cache = TRUE
on_disk = FALSE

Expand Down
6 changes: 6 additions & 0 deletions R-arrow/setup-R-arrow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Stop at the first command that exits with a non-zero status.
set -o errexit

# Make sure the private library directory exists, then install stable arrow
# (together with dplyr) into it.
mkdir -p ./R-arrow/r-arrow
Rscript -e 'install.packages(c("arrow","dplyr"), lib="./R-arrow/r-arrow")'
2 changes: 1 addition & 1 deletion arrow/upg-arrow.sh → R-arrow/upg-R-arrow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ set -e

# upgrade all packages in arrow library only if new arrow is out
echo 'upgrading arrow...'
Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./arrow/r-arrow")) update.packages(lib.loc="./arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)'
Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./R-arrow/r-arrow")) update.packages(lib.loc="./R-arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)'
4 changes: 4 additions & 0 deletions R-arrow/ver-R-arrow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
set -e

# Record the installed arrow version for the benchmark:
#  - read the Version and RemoteSha fields from the DESCRIPTION file of the
#    arrow package installed in ./R-arrow/r-arrow,
#  - rename the RemoteSha column to Revision,
#  - replace missing (NA) values with an empty string,
#  - write each value to a file named after the upper-cased field name inside
#    the R-arrow directory (i.e. R-arrow/VERSION and R-arrow/REVISION).
Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./R-arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("R-arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))'
14 changes: 7 additions & 7 deletions _benchplot/benchplot-dict.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ solution.dict = {list(
"juliads" = list(name=c(short="IMD.jl", long="InMemoryDatasets.jl"), color=c(strong="#b80000", light="#ff1f1f")),
"clickhouse" = list(name=c(short="clickhouse", long="ClickHouse"), color=c(strong="hotpink4", light="hotpink1")),
"polars" = list(name=c(short="polars", long="Polars"), color=c(strong="deepskyblue4", light="deepskyblue3")),
"arrow" = list(name=c(short="arrow", long="Arrow"), color=c(strong="aquamarine3", light="aquamarine1")),
"R-arrow" = list(name=c(short="R-arrow", long="R-arrow"), color=c(strong="aquamarine3", light="aquamarine1")),
"duckdb" = list(name=c(short="duckdb", long="DuckDB"), color=c(strong="#ddcd07", light="#fff100")),
"duckdb-latest" = list(name=c(short="duckdb-latest", long="duckdb-latest"), color=c(strong="#ddcd07", light="#fff100")),
"datafusion" = list(name=c(short="datafusion", long="Datafusion"), color=c(strong="deepskyblue4", light="deepskyblue3"))
Expand Down Expand Up @@ -199,7 +199,7 @@ groupby.syntax.dict = {list(
"regression v1 v2 by id2 id4" = "DF.groupby(['id2','id4']).agg((pl.pearson_corr('v1','v2')**2).alias('r2')).collect()",
"sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6']).agg([pl.sum('v3').alias('v3'), pl.count('v1').alias('count')]).collect()"
)},
"arrow" = {c(
"R-arrow" = {c(
"sum v1 by id1" = "AT %>% group_by(id1) %>% summarise(v1=sum(v1, na.rm=TRUE))",
"sum v1 by id1:id2" = "AT %>% group_by(id1, id2) %>% summarise(v1=sum(v1, na.rm=TRUE))",
"sum v1 mean v3 by id3" = "AT %>% group_by(id3) %>% summarise(v1=sum(v1, na.rm=TRUE), v3=mean(v3, na.rm=TRUE))",
Expand Down Expand Up @@ -260,7 +260,7 @@ groupby.syntax.dict = {list(
"juliads" = list(),
"clickhouse" = list(),
"polars" = list(),
"arrow" = list("Expression row_number() <= 2L not supported in Arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in arrow; pulling data into R" = "regression v1 v2 by id2 id4"),
"R-arrow" = list("Expression row_number() <= 2L not supported in R-arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in R-arrow; pulling data into R" = "regression v1 v2 by id2 id4"),
"duckdb" = list(),
"duckdb-latest" = list(),
"datafusion" = list()
Expand Down Expand Up @@ -309,7 +309,7 @@ groupby.data.exceptions = {list(
"polars" = {list(
# "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0") # q10
)},
"arrow" = {list(
"R-arrow" = {list(
# "timeout" = c(), # q10
"internal error" = c("G1_1e8_2e0_0_0", "G1_1e8_1e2_0_1", "G1_1e8_1e2_5_0", "G1_1e9_1e2_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0","G1_1e9_1e1_0_0", # inherits from dplyr
"G1_1e9_2e0_0_0"), # #190
Expand Down Expand Up @@ -413,7 +413,7 @@ join.syntax.dict = {list(
"medium inner on factor" = "DF.merge(medium, on='id5')",
"big inner on int" = "DF.merge(big, on='id3')"
)},
"arrow" = {c(
"R-arrow" = {c(
"small inner on int" = "inner_join(DF, small, by='id1')",
"medium inner on int" = "inner_join(DF, medium, by='id2')",
"medium outer on int" = "left_join(DF, medium, by='id2')",
Expand Down Expand Up @@ -454,7 +454,7 @@ join.query.exceptions = {list(
"juliads" = list(),
"clickhouse" = list(),
"polars" = list(),
"arrow" = list(),
"R-arrow" = list(),
"duckdb" = list(),
"duckdb-latest" = list(),
"datafusion" = list()
Expand Down Expand Up @@ -496,7 +496,7 @@ join.data.exceptions = {list(
"polars" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")
)},
"arrow" = {list(
"R-arrow" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1", "J1_1e8_NA_0_0", "J1_1e8_NA_5_0", "J1_1e8_NA_0_1" )#,
# "not yet implemented: #189" = c("J1_1e7_NA_0_0","J1_1e7_NA_5_0","J1_1e7_NA_0_1","J1_1e8_NA_0_0","J1_1e8_NA_5_0","J1_1e8_NA_0_1","J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")
)},
Expand Down
4 changes: 2 additions & 2 deletions _control/solutions.csv
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ clickhouse,groupby
clickhouse,join
polars,groupby
polars,join
arrow,groupby
arrow,join
R-arrow,groupby
R-arrow,join
duckdb,groupby
duckdb,join
duckdb-latest,groupby
Expand Down
2 changes: 1 addition & 1 deletion _launcher/launcher.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ readret = function(x) {
file.ext = function(x) {
ans = switch(
x,
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R",
"pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
"clickhouse"="sql",
"juliadf"="jl", "juliads"="jl",
Expand Down
2 changes: 1 addition & 1 deletion _launcher/solution.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ if ("quiet" %in% names(args)) {
file.ext = function(x) {
ans = switch(
x,
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R",
"pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
"clickhouse"="sql",
"juliadf"="jl", "juliads"="jl"
Expand Down
13 changes: 10 additions & 3 deletions _report/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ get_report_status_file = function(path=getwd()) {
file.path(path, "report-done")
}
# Return the character vector of solution names covered by the report.
# NOTE(review): the body below contains two c(...) expressions; an R function returns
# its last expression, so only the second vector is the result.
# NOTE(review): the second vector lists both "arrow" and "R-arrow" — presumably so that
# results recorded under the old name are still reported; confirm.
get_report_solutions = function() {
c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars","arrow","duckdb", "duckdb-latest", "datafusion")
c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars", "duckdb", "duckdb-latest", "datafusion", "arrow", "R-arrow")
}
get_data_levels = function() {
## groupby
Expand Down Expand Up @@ -69,6 +69,9 @@ clean_time = function(d) {
if (nrow(d[!nzchar(version) | is.na(version)]))
stop("timings data contains NA or '' as version field, that should not happen")
old_advanced_groupby_questions = c("median v3 sd v3 by id2 id4","max v1 - min v2 by id2 id4","largest two v3 by id2 id4","regression v1 v2 by id2 id4","sum v3 count by id1:id6")

# replace arrow with R-arrow (see https://github.com/duckdblabs/db-benchmark/pull/66)
# The earlier form used `==`, which only compares and discards the result, so no
# row was ever renamed. `:=` updates the column in place, in the same data.table
# style as the statements on `d` that follow.
d[solution == "arrow", solution := "R-arrow"]
d[!nzchar(git), git := NA_character_
][,"on_disk" := as.logical(on_disk)
][task=="groupby" & solution%in%c("pandas","dask","spark") & batch<1558106628, "out_cols" := NA_integer_
Expand Down Expand Up @@ -243,9 +246,13 @@ transform = function(ld) {
# all ----

time_logs = function(path=getwd()) {
ct = clean_time(load_time(path=getwd()))
# Load the raw timings from the directory passed as `path`; the call used to be
# hard-coded to getwd(), which ignored the function's `path` argument (its default
# is still getwd(), so default calls behave as before).
lt <- load_time(path=path)

# Clean the loaded timings before they are modelled.
ct = clean_time(lt)
d = model_time(ct)
l = model_logs(clean_logs(load_logs(path=path)))
ll <- load_logs(path=path)
ll$solution[ll$solution == "arrow"] <- "R-arrow"
l = model_logs(clean_logs(ll))
q = model_questions(clean_questions(load_questions(path=path)))

lq = merge_logs_questions(l, q)
Expand Down
6 changes: 0 additions & 6 deletions arrow/setup-arrow.sh

This file was deleted.

4 changes: 0 additions & 4 deletions arrow/ver-arrow.sh

This file was deleted.

2 changes: 1 addition & 1 deletion run.conf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# task, used in init-setup-iteration.R
export RUN_TASKS="groupby join"
# solution, used in init-setup-iteration.R
export RUN_SOLUTIONS="collapse data.table juliads juliadf dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb duckdb-latest datafusion"
export RUN_SOLUTIONS="collapse data.table juliads juliadf dplyr pandas pydatatable spark dask clickhouse polars R-arrow duckdb duckdb-latest datafusion"

# flag to upgrade tools, used in run.sh on init
export DO_UPGRADE=false
Expand Down
4 changes: 2 additions & 2 deletions run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "h2o" ]]; then ./h2o/upg-h2o.
if [[ "$RUN_SOLUTIONS" =~ "h2o" ]]; then ./h2o/ver-h2o.sh; fi;
if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "polars" ]]; then ./polars/upg-polars.sh; fi;
if [[ "$RUN_SOLUTIONS" =~ "polars" ]]; then ./polars/ver-polars.sh; fi;
if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "arrow" ]]; then ./arrow/upg-arrow.sh; fi;
if [[ "$RUN_SOLUTIONS" =~ "arrow" ]]; then ./arrow/ver-arrow.sh; fi;
# R-arrow: optionally upgrade, then record the installed version.
# The upgrade script is named upg-R-arrow.sh (renamed from arrow/upg-arrow.sh), not R-upg-arrow.sh.
if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "R-arrow" ]]; then ./R-arrow/upg-R-arrow.sh; fi;
if [[ "$RUN_SOLUTIONS" =~ "R-arrow" ]]; then ./R-arrow/ver-R-arrow.sh; fi;
if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" == "duckdb" ]]; then ./duckdb/upg-duckdb.sh; fi;
if [[ "$RUN_SOLUTIONS" == "duckdb" ]]; then ./duckdb/ver-duckdb.sh; fi;
if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" == "duckdb-latest" ]]; then ./duckdb-latest/setup-duckdb-latest.sh; fi;
Expand Down

0 comments on commit c84afb2

Please sign in to comment.