New run duckdb 1.1 #96

Merged 21 commits on Oct 28, 2024.
Changes from all commits
4 changes: 2 additions & 2 deletions .github/workflows/regression.yml
@@ -48,7 +48,7 @@ jobs:

- name: Install all solutions
shell: bash
-run: source path.env && python3 _utils/install_all_solutions.py ${{ matrix.solution }}
+run: source path.env && python3 _setup_utils/install_all_solutions.py ${{ matrix.solution }}

- name: Turn swap off
shell: bash
@@ -135,7 +135,7 @@ jobs:

- name: Install all solutions
shell: bash
-run: source path.env && python3 _utils/install_all_solutions.py all
+run: source path.env && python3 _setup_utils/install_all_solutions.py all

- name: Turn swap off
shell: bash
1 change: 1 addition & 0 deletions R-arrow/VERSION
@@ -0,0 +1 @@
17.0.0.1
12 changes: 8 additions & 4 deletions _benchplot/benchplot-dict.R
@@ -39,7 +39,7 @@ solution.dict = {list(
"spark" = list(name=c(short="spark", long="spark"), color=c(strong="#8000FFFF", light="#CC66FF")),
"dask" = list(name=c(short="dask", long="dask"), color=c(strong="slategrey", light="lightgrey")),
"juliadf" = list(name=c(short="DF.jl", long="DataFrames.jl"), color=c(strong="deepskyblue", light="darkturquoise")),
"juliads" = list(name=c(short="IMD.jl", long="InMemoryDatasets.jl"), color=c(strong="#b80000", light="#ff1f1f")),
"juliads" = list(name=c(short="IMD.jl", long="InMemData.jl"), color=c(strong="#b80000", light="#ff1f1f")),
"clickhouse" = list(name=c(short="clickhouse", long="ClickHouse"), color=c(strong="hotpink4", light="hotpink1")),
"polars" = list(name=c(short="polars", long="Polars"), color=c(strong="deepskyblue4", light="deepskyblue3")),
"R-arrow" = list(name=c(short="R-arrow", long="R-arrow"), color=c(strong="aquamarine3", light="aquamarine1")),
@@ -299,10 +299,12 @@ groupby.data.exceptions = {list(
"G1_1e8_2e0_0_0") # q3
)},
"juliadf" = {list(
"timeout" = "G1_1e8_2e0_0_0",
"out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0") # CSV.File
# "timeout" = "G1_1e8_2e0_0_0",
# "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0"), # CSV.File
"CSV import Segfault: JuliaLang#55765" = c("G1_1e7_1e2_0_0","G1_1e7_1e1_0_0","G1_1e7_2e0_0_0","G1_1e7_1e2_0_1","G1_1e7_1e2_5_0","G1_1e8_1e2_0_0","G1_1e8_1e1_0_0","G1_1e8_2e0_0_0","G1_1e8_1e2_0_1","G1_1e8_1e2_5_0","G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0")
)},
"juliads" = {list(
"CSV import Segfault: JuliaLang#55765" = c("G1_1e7_1e2_0_0","G1_1e7_1e1_0_0","G1_1e7_2e0_0_0","G1_1e7_1e2_0_1","G1_1e7_1e2_5_0","G1_1e8_1e2_0_0","G1_1e8_1e1_0_0","G1_1e8_2e0_0_0","G1_1e8_1e2_0_1","G1_1e8_1e2_5_0","G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0")
)},
"clickhouse" = {list(
)},
@@ -485,9 +487,11 @@ join.data.exceptions = {list(
"out of memory" = c("J1_1e9_NA_0_0") # q1 even when using on-disk, after 47m (480m timeout)
)},
"juliadf" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") # CSV.File
# "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") # CSV.File
"CSV import Segfault: JuliaLang#55765" = c("J1_1e7_NA_0_0", "J1_1e7_NA_5_0", "J1_1e7_NA_0_1", "J1_1e8_NA_0_0", "J1_1e8_NA_5_0", "J1_1e8_NA_0_1", "J1_1e9_NA_0_0")
)},
"juliads" = {list(
"CSV import Segfault: JuliaLang#55765" = c("J1_1e7_NA_0_0", "J1_1e7_NA_5_0", "J1_1e7_NA_0_1", "J1_1e8_NA_0_0", "J1_1e8_NA_5_0", "J1_1e8_NA_0_1", "J1_1e9_NA_0_0")
)},
"clickhouse" = {list(
)},
4 changes: 2 additions & 2 deletions _report/index.Rmd
@@ -145,7 +145,7 @@ loop_benchplot(dt_join, report_name="join", syntax.dict=join.syntax.dict, except
![](./join/J1_1e7_NA_0_0_advanced.png)
-->

-#### 5 GB {.active}
+#### 5 GB

##### **basic questions**

@@ -158,7 +158,7 @@ loop_benchplot(dt_join, report_name="join", syntax.dict=join.syntax.dict, except
![](./join/J1_1e8_NA_0_0_advanced.png)
-->

-#### 50 GB
+#### 50 GB {.active}

##### **basic questions**

5 changes: 5 additions & 0 deletions _run/partitioned_run.sh
@@ -0,0 +1,5 @@
# set machine type
./_run/run_small_medium_groupby_join.sh

./_run/run_large_groupby_join.sh
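
A possible hardening of this entry point (reviewer sketch, not part of the PR, assuming it is invoked from the repository root): the new script has no shebang and no error handling, so a failure in the first sub-script would not stop the second. A minimal variant:

#!/bin/bash
# hypothetical hardening: abort on the first failing step so a failed
# download or run cannot silently cascade into the next stage
set -euo pipefail

# set machine type
./_run/run_small_medium_groupby_join.sh
./_run/run_large_groupby_join.sh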

31 changes: 31 additions & 0 deletions _run/run_large_groupby_join.sh
@@ -0,0 +1,31 @@
# download and expand large data

# get groupby large (50GB datasets)
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb
# get join large (50GB datasets)
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_large.duckdb data/join_large.duckdb


# expand groupby-large datasets to csv
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'data/G1_1e9_1e2_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'data/G1_1e9_1e1_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'data/G1_1e9_2e0_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'data/G1_1e9_1e2_0_1.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'data/G1_1e9_1e2_5_0.csv' (FORMAT CSV)"

# expand join-large datasets to csv
duckdb data/join_large.duckdb -c "copy J1_1e9_NA_0_0 to 'data/J1_1e9_NA_0_0.csv' (FORMAT CSV)"
duckdb data/join_large.duckdb -c "copy J1_1e9_1e9_0_0 to 'data/J1_1e9_1e9_0_0.csv' (FORMAT CSV)"
duckdb data/join_large.duckdb -c "copy J1_1e9_1e6_0_0 to 'data/J1_1e9_1e6_0_0.csv' (FORMAT CSV)"
duckdb data/join_large.duckdb -c "copy J1_1e9_1e3_0_0 to 'data/J1_1e9_1e3_0_0.csv' (FORMAT CSV)"


echo "Running all solutions on large (50GB) datasets"
./run.sh


###
echo "done..."
echo "removing data files"
#rm data/*.csv
#rm data/*.duckdb
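
A possible tightening of the export block above (reviewer sketch, not part of the PR, assuming the duckdb CLI is on PATH): the per-table calls behave identically when collapsed into a loop over the table names used in the script.

# hypothetical refactor: one export loop per database file
for tbl in G1_1e9_1e2_0_0 G1_1e9_1e1_0_0 G1_1e9_2e0_0_0 G1_1e9_1e2_0_1 G1_1e9_1e2_5_0; do
    duckdb data/groupby_large.duckdb -c "COPY $tbl TO 'data/$tbl.csv' (FORMAT CSV)"
done
for tbl in J1_1e9_NA_0_0 J1_1e9_1e9_0_0 J1_1e9_1e6_0_0 J1_1e9_1e3_0_0; do
    duckdb data/join_large.duckdb -c "COPY $tbl TO 'data/$tbl.csv' (FORMAT CSV)"
done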
59 changes: 59 additions & 0 deletions _run/run_small_medium_groupby_join.sh
@@ -0,0 +1,59 @@
# first download and expand small data

# get groupby small (0.5GB and 5GB datasets)
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_small.duckdb data/groupby_small.duckdb
# get join small (0.5GB and 5GB datasets)
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_small.duckdb data/join_small.duckdb


# expand groupby-small datasets to csv
duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_0_0 to 'data/G1_1e7_1e2_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e1_0_0 to 'data/G1_1e7_1e1_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_small.duckdb -c "copy G1_1e7_2e0_0_0 to 'data/G1_1e7_2e0_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_0_1 to 'data/G1_1e7_1e2_0_1.csv' (FORMAT CSV)"
duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_5_0 to 'data/G1_1e7_1e2_5_0.csv' (FORMAT CSV)"
duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_0_0 to 'data/G1_1e8_1e2_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e1_0_0 to 'data/G1_1e8_1e1_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_small.duckdb -c "copy G1_1e8_2e0_0_0 to 'data/G1_1e8_2e0_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_0_1 to 'data/G1_1e8_1e2_0_1.csv' (FORMAT CSV)"
duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_5_0 to 'data/G1_1e8_1e2_5_0.csv' (FORMAT CSV)"

# expand join-small datasets to csv
duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_0_0 to 'data/J1_1e7_1e1_0_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_5_0 to 'data/J1_1e7_1e4_5_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e7_NA_0_1 to 'data/J1_1e7_NA_0_1.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_0_0 to 'data/J1_1e8_1e5_0_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_5_0 to 'data/J1_1e8_1e8_5_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_0_1 to 'data/J1_1e7_1e1_0_1.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_0_0 to 'data/J1_1e7_1e7_0_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e7_NA_5_0 to 'data/J1_1e7_NA_5_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_0_1 to 'data/J1_1e8_1e5_0_1.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e8_NA_0_0 to 'data/J1_1e8_NA_0_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_5_0 to 'data/J1_1e7_1e1_5_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_0_1 to 'data/J1_1e7_1e7_0_1.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_0_0 to 'data/J1_1e8_1e2_0_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_5_0 to 'data/J1_1e8_1e5_5_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e8_NA_0_1 to 'data/J1_1e8_NA_0_1.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_0_0 to 'data/J1_1e7_1e4_0_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_5_0 to 'data/J1_1e7_1e7_5_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_0_1 to 'data/J1_1e8_1e2_0_1.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_0_0 to 'data/J1_1e8_1e8_0_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e8_NA_5_0 to 'data/J1_1e8_NA_5_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_0_1 to 'data/J1_1e7_1e4_0_1.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e7_NA_0_0 to 'data/J1_1e7_NA_0_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_5_0 to 'data/J1_1e8_1e2_5_0.csv' (FORMAT CSV)"
duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_0_1 to 'data/J1_1e8_1e8_0_1.csv' (FORMAT CSV)"


cp _control/data_small.csv _control/data.csv


echo "Running all solutions on small (0.5GB and 5GB) datasets"
./run.sh


###
echo "done..."
echo "removing small data files"
rm data/*.csv
rm data/*.duckdb
4 files renamed without changes.
2 changes: 1 addition & 1 deletion clickhouse/VERSION
@@ -1 +1 @@
-23.10.4.25
+24.8.4.13
2 changes: 1 addition & 1 deletion collapse/VERSION
@@ -1 +1 @@
-2.0.3
+2.0.16
2 changes: 1 addition & 1 deletion dask/VERSION
@@ -1 +1 @@
-2023.10.0
+2024.9.0
2 changes: 1 addition & 1 deletion dask/setup-dask.sh
@@ -1,7 +1,7 @@
#!/bin/bash
set -e

-virtualenv dask/py-dask --python=python3.10
+virtualenv dask/py-dask --python=python3.12
source dask/py-dask/bin/activate

# install binaries
2 changes: 1 addition & 1 deletion datafusion/VERSION
@@ -1 +1 @@
-31.0.0
+41.0.0
2 changes: 1 addition & 1 deletion datatable/VERSION
@@ -1 +1 @@
-1.14.9
+1.16.99
2 changes: 1 addition & 1 deletion dplyr/VERSION
@@ -1 +1 @@
-1.1.3
+1.1.4
2 changes: 1 addition & 1 deletion duckdb-latest/VERSION
@@ -1 +1 @@
-0.9.1.1
+1.0.99.9000
2 changes: 1 addition & 1 deletion duckdb/VERSION
@@ -1 +1 @@
-1.0.0
+1.1.0
2 changes: 1 addition & 1 deletion duckdb/setup-duckdb.sh
@@ -11,7 +11,7 @@ Rscript -e 'install.packages("DBI", lib="./duckdb/r-duckdb", repos = "http://clo
cd duckdb
git clone https://github.com/duckdb/duckdb-r.git
cd duckdb-r
-git checkout v1.0.0
+git checkout v1.1.0
cd ..
ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'`
MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb" duckdb-r
10 changes: 5 additions & 5 deletions juliadf/setup-juliadf.sh
@@ -1,11 +1,11 @@
# install julia

-wget https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.4-linux-x86_64.tar.gz
-tar -xvf julia-1.10.4-linux-x86_64.tar.gz
-sudo mv julia-1.10.4 /opt
-rm julia-1.10.4-linux-x86_64.tar.gz
+wget https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.5-linux-x86_64.tar.gz
+tar -xvf julia-1.10.5-linux-x86_64.tar.gz
+sudo mv julia-1.10.5 /opt
+rm julia-1.10.5-linux-x86_64.tar.gz
# put to paths
-echo 'export JULIA_HOME=/opt/julia-1.10.4' >> path.env
+echo 'export JULIA_HOME=/opt/julia-1.10.5' >> path.env
echo 'export PATH=$PATH:$JULIA_HOME/bin' >> path.env
# note that cron job must have path updated as well

2 changes: 1 addition & 1 deletion juliads/VERSION
@@ -1 +1 @@
-0.7.18
+0.7.21
16 changes: 7 additions & 9 deletions juliads/setup-juliads.sh
@@ -1,20 +1,18 @@

# install julia
-wget https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.4-linux-x86_64.tar.gz
-tar -xvf julia-1.10.4-linux-x86_64.tar.gz
-sudo mv julia-1.10.4 /opt
-rm julia-1.10.4-linux-x86_64.tar.gz
+wget https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.5-linux-x86_64.tar.gz
+tar -xvf julia-1.10.5-linux-x86_64.tar.gz
+sudo mv julia-1.10.5 /opt
+rm julia-1.10.5-linux-x86_64.tar.gz
# put to paths
-echo 'export JULIA_HOME=/opt/julia-1.10.4' >> path.env
+echo 'export JULIA_HOME=/opt/julia-1.10.5' >> path.env
echo 'export PATH=$PATH:$JULIA_HOME/bin' >> path.env
echo "export JULIA_NUM_THREADS=40" >> path.env
# note that cron job must have path updated as well

source path.env

# install julia InMemoryDatasets and csv packages
-julia -q -e 'using Pkg; Pkg.add(["InMemoryDatasets","DLMReader", "PooledArrays", "Arrow"])'
+julia -q -e 'using Pkg; Pkg.add(["InMemoryDatasets","DLMReader", "PooledArrays", "Arrow", "CSV"])'
julia -q -e 'include("$(pwd())/_helpers/helpersds.jl"); pkgmeta = getpkgmeta("InMemoryDatasets"); println(string(pkgmeta["version"])); pkgmeta = getpkgmeta("DLMReader"); println(string(pkgmeta["version"]))'

-./juliadf/ver-juliads.sh
+./juliadf/ver-juliadf.sh