TPC-H Benchmarks #57

Workflow file for this run

name: TPC-H Benchmarks
on:
  workflow_dispatch:
    inputs:
      scale:
        description: 'Scale Factor'
        required: true
        default: 1000
        type: choice
        options:
          - 10000
          - 1000
          - 100
          - 10
          - 1
      dask:
        description: 'Dask'
        required: true
        default: true
        type: boolean
      duckdb:
        description: 'DuckDB'
        required: true
        default: true
        type: boolean
      polars:
        description: 'Polars'
        required: true
        default: false
        type: boolean
      pyspark:
        description: 'PySpark'
        required: true
        default: true
        type: boolean
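# One way to dispatch this workflow from the command line with the GitHub CLI.
# The file name tpch.yml is an assumption; substitute the actual file name of
# this workflow under .github/workflows/:
#   gh workflow run tpch.yml -f scale=100 -f dask=true -f duckdb=true \
#     -f polars=false -f pyspark=true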
concurrency:
  # Only allow a single run at a time due to rate limiting
  group: tpch
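# Without cancel-in-progress, a newly dispatched run queues behind the one
# already running in the "tpch" group, so an in-flight benchmark is never
# interrupted.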
defaults:
  # Required shell entrypoint to have properly activated conda environments
  run:
    shell: bash -l {0}
jobs:
  tpch:
    name: TPC-H
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Set up environment
        uses: conda-incubator/setup-miniconda@v3
        with:
          miniforge-variant: Mambaforge
          use-mamba: true
          condarc-file: ci/condarc
          python-version: "3.9"
          environment-file: ci/environment.yml
      - name: Pip Compile
        run: pip-compile ci/requirements-2tpch-non-dask.in
      - name: Add Environment dependencies
        run: python -m pip install -r ci/requirements-2tpch-non-dask.txt
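      # pip-compile (from pip-tools) resolves the loose pins in the .in file
      # into a fully pinned requirements .txt, which the step above installs
      # on top of the conda environment.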
      - name: Dump environment
        run: |
          # For debugging
          echo -e "--\n--Conda Environment (re-create this with \`conda env create --name <name> -f <output_file>\`)\n--"
          mamba env export | grep -E -v '^prefix:.*$' | tee mamba_env_export.yml
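      # The mamba_env_export.yml written here is uploaded together with the
      # results database by the final artifact step.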
      - name: Add Dask to benchmark if enabled
        if: ${{ inputs.dask }}
        run: |
          echo PYTEST_BENCHMARKS="${{ env.PYTEST_BENCHMARKS }} tests/tpch/test_dask.py" >> $GITHUB_ENV
      - name: Add DuckDB to benchmark if enabled
        if: ${{ inputs.duckdb }}
        run: |
          echo PYTEST_BENCHMARKS="${{ env.PYTEST_BENCHMARKS }} tests/tpch/test_duckdb.py" >> $GITHUB_ENV
      - name: Add Polars to benchmark if enabled
        if: ${{ inputs.polars }}
        run: |
          echo PYTEST_BENCHMARKS="${{ env.PYTEST_BENCHMARKS }} tests/tpch/test_polars.py" >> $GITHUB_ENV
      - name: Add PySpark to benchmark if enabled
        if: ${{ inputs.pyspark }}
        run: |
          echo PYTEST_BENCHMARKS="${{ env.PYTEST_BENCHMARKS }} tests/tpch/test_pyspark.py" >> $GITHUB_ENV
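      # Each enabled step above appends one test module to PYTEST_BENCHMARKS.
      # With the default inputs (Dask, DuckDB, and PySpark on; Polars off),
      # the variable expands to:
      #   tests/tpch/test_dask.py tests/tpch/test_duckdb.py tests/tpch/test_pyspark.py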
      - name: Run TPC-H benchmarks
        env:
          DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }}
          AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }}
          AWS_DEFAULT_REGION: us-east-2 # this is needed for boto for some reason
          AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }}
          DB_NAME: tpch_${{ inputs.scale }}.db
          CLUSTER_DUMP: always
          DASK_DATAFRAME__QUERY_PLANNING: True
        run: |
          # Run only the test modules selected above; -n 4 --dist loadscope
          # runs them under pytest-xdist with four workers, grouped by module.
          pytest --benchmark \
            ${{ env.PYTEST_BENCHMARKS }} \
            -n 4 --dist loadscope \
            --scale ${{ inputs.scale }}
      - name: Upload benchmark results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: tpch-benchmark
          path: |
            tpch_${{ inputs.scale }}.db
            mamba_env_export.yml
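
# One way to fetch the uploaded results locally once the run has finished,
# using the GitHub CLI (replace <run-id> with the id of this run):
#   gh run download <run-id> --name tpch-benchmark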