name: GitHub Main Action
on:
push:
pull_request:
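  # In addition to every push and pull request, the schedule below runs the pipeline monthly: 03:00 UTC on the 17th.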
schedule:
- cron: "0 3 17 * *"
defaults:
run:
shell: bash
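# The SKIP_* flags let individual systems be excluded; the corresponding steps check them via `if:` conditions.
# SCALE_FACTOR is the TPC-H scale factor used for data generation and all benchmark runs.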
env:
SKIP_HYRISE: false
SKIP_MONETDB: false
SKIP_DUCKDB: false
SCALE_FACTOR: 0.5
CMAKE_GENERATOR: Ninja
jobs:
hyrise_full_pipeline:
name: Hyrise - Full calibration and evaluation pipeline
runs-on: ubuntu-24.04
outputs:
core_count: ${{ steps.core_client_counts.outputs.core_count }}
client_count: ${{ steps.core_client_counts.outputs.client_count }}
calibration_run: ${{ steps.calibration.outputs.calibration_run }}
steps:
- uses: actions/checkout@master
with:
submodules: recursive
- name: Install dependencies for Act setup
if: ${{ env.ACT }}
run: |
sudo apt-get update -y -qq
sudo apt-get install -y -qq git build-essential cmake python3-pip
- name: Install dependencies
run: |
sudo apt-get update -y -qq
sudo add-apt-repository ppa:deadsnakes/ppa --yes
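          # The deadsnakes PPA added above provides the python3.11 packages installed below (Ubuntu 24.04 ships a newer default Python).
          # The apt search below only prints the available postgresql-server-dev versions.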
sudo apt search postgresql-server-dev
          # We don't use Hyrise's install_dependencies script, as it installs much more than this small setup needs.
sudo apt-get install -y ninja-build libboost-all-dev postgresql-server-dev-16 libtbb-dev libsqlite3-dev systemtap-sdt-dev lld numactl python3.11-full python3.11-venv
python3.11 -m venv ~/venv
source ~/venv/bin/activate
#python -m ensurepip --upgrade
#python -m pip install --upgrade pip
#python -m pip install --upgrade setuptools
#pip3 install setuptools # --quiet ... needed for latest Python version, re-check later
python -m pip install -r python/requirements.txt # --quiet
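      # The client count is roughly 75% of the available cores; the comparison runtime is SCALE_FACTOR * 3500 seconds, clamped to [300, 1800].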
- name: Determine core and client counts for database comparison
id: core_client_counts
run: |
core_count=`grep -Pc '^processor\t' /proc/cpuinfo`
client_count=$(python -c "import math; print(int(math.ceil(${core_count}*0.75)))")
comparison_runtime=$(python -c "print(min(1800, max(300, int(${{ env.SCALE_FACTOR }}*3500))))")
echo "Using ${core_count} cores and ${client_count} clients, comparison benchmarks running for ${comparison_runtime} seconds."
echo "CORE_COUNT=${core_count}" >> $GITHUB_ENV
echo "CLIENT_COUNT=${client_count}" >> $GITHUB_ENV
echo "COMPARISON_RUNTIME=${comparison_runtime}" >> $GITHUB_ENV
echo "core_count=${core_count}" >> $GITHUB_OUTPUT
echo "client_count=${client_count}" >> $GITHUB_OUTPUT
- name: Build release server and plugins
if: env.SKIP_HYRISE == 'false'
run: |
mkdir -p encoding_plugin/rel
pushd encoding_plugin/rel > /dev/null
          # Erase all segment encoding types. This hurts performance but allows us to compile in release mode on GitHub-hosted runners.
cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_POSIX_REGEX=0 -DTHREADS_PREFER_PTHREAD_FLAG=1 -DCMAKE_THREAD_LIBS_INIT="-lpthread" -DCMAKE_HAVE_THREADS_LIBRARY=1 -DCMAKE_USE_PTHREADS_INIT=1 -DERASE_SEGMENT_TYPES=Dictionary,LZ4,RunLength,FSST,FrameOfReference,Unencoded,FixedStringDictionary ..
cmake --build . --target hyriseServer WorkloadStatisticsPlugin WorkloadHandlerPlugin CommandExecutorPlugin DataCharacteristicsPlugin
popd > /dev/null
- name: Run calibration - data collection phase (TPC-H only)
if: env.SKIP_HYRISE == 'false'
run: |
          # We have a custom CMake target that might not trigger correctly. Since we don't use TPC-DS in this GitHub Actions
          # run, creating an empty directory should be fine.
          rm -rf encoding_plugin/rel/resources || true  # mkdir -p does not work with symlinks, so remove the directory first.
mkdir -p encoding_plugin/rel/resources/benchmark/tpcds/tpcds-result-reproduction/query_qualification
pushd python > /dev/null
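          # Collect calibration measurements for TPC-H at the configured scale factor (one base run, three random encoding configurations).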
python3 runner.py --hyrise_server_path=../encoding_plugin/rel/ --base_benchmark_runs=1 --single_benchmark=TPC-H --execute=calibration --scale_factor ${{ env.SCALE_FACTOR }} --random_encoding_configs_count=3
popd > /dev/null
- name: Run calibration - learn runtime and size models
id: calibration
if: env.SKIP_HYRISE == 'false'
run: |
pushd python > /dev/null
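          # Pick the most recently created calibration directory (ls -t sorts by modification time; result directories are filtered out).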
calibration_run=`ls -t calibration | grep -v 'results' | head -n1`
# Run pipeline without selection.
python3 encoding_selection_pipeline.py --calibration_dir=calibration/${calibration_run} --skip_phases selection
popd > /dev/null
echo "calibration_run=${calibration_run}" >> $GITHUB_OUTPUT
echo "CALIBRATION_RUN=${calibration_run}" >> $GITHUB_ENV
- name: Run encoding selection
if: env.SKIP_HYRISE == 'false'
run: |
pushd python > /dev/null
# Run selection. For simplicity: use calibration workload as workload to optimize.
python3 encoding_selection_pipeline.py --calibration_dir=calibration/${{ env.CALIBRATION_RUN }} --use_calibration_as_workload --skip_phases load_csv prepare learn_runtime learn_size --budget_steps_stretch_factor 5.0
popd > /dev/null
- name: Benchmark encoding configurations
if: env.SKIP_HYRISE == 'false'
run: |
pushd python
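          # Benchmark TPC-H once for each encoding configuration written by the selection step; results land in the evaluation results directory.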
python3 runner.py --hyrise_server_path=../encoding_plugin/rel/ --execute=evaluation --configurations_dir "evaluation/${{ env.CALIBRATION_RUN }}/configurations__default/TPCH" --results_dir "evaluation/${{ env.CALIBRATION_RUN }}/results/TPCH" --scale_factor ${{ env.SCALE_FACTOR }} --single_benchmark=TPCH --port 5551
popd
- name: Benchmark non-constrained Hyrise (database comparison)
if: env.SKIP_HYRISE == 'false'
run: |
pushd python
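          # The first run measures throughput with the full client count; the second run (--determine_size_only) only records the data size.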
python3 db_comparison_runner.py hyrise --hyrise_server_path=../encoding_plugin/rel/ --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --clients=${{ env.CLIENT_COUNT }} --time=${{ env.COMPARISON_RUNTIME }}
python3 db_comparison_runner.py hyrise --hyrise_server_path=../encoding_plugin/rel/ --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --determine_size_only
popd
- name: Benchmark budget-constrained Hyrise (database comparison)
if: env.SKIP_HYRISE == 'false'
run: |
pushd python
python3 runner.py --hyrise_server_path=../encoding_plugin/rel/ --execute=evaluation --configurations_dir "evaluation/${{ env.CALIBRATION_RUN }}/configurations__default/TPCH/LPCompressionSelection" --results_dir "evaluation/${{ env.CALIBRATION_RUN }}/results/TPCH/LPCompressionSelection" --scale_factor ${{ env.SCALE_FACTOR }} --single_benchmark=TPCH --port 5551 --cores=${{ env.CORE_COUNT }} --clients=${{ env.CLIENT_COUNT }}
popd
- name: Upload benchmark results (non-constrained)
uses: actions/upload-artifact@master
if: env.SKIP_HYRISE == 'false'
with:
name: comparison_results_hyrise_non-constrained
path: |
python/db_comparison_results/*.csv
- name: Upload benchmark results (budget-constrained)
uses: actions/upload-artifact@master
if: env.SKIP_HYRISE == 'false'
with:
name: comparison_results_hyrise_budget-constrained
path: |
python/evaluation/${{ env.CALIBRATION_RUN }}/results/TPCH/*.csv
python/evaluation/${{ env.CALIBRATION_RUN }}/results/TPCH/LPCompressionSelection/*.csv
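  # The comparison job builds MonetDB from source, generates TPC-H data, and benchmarks MonetDB and DuckDB with the same scale factor and client settings.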
database_comparison:
name: Database Comparison
runs-on: ubuntu-24.04
outputs:
core_count: ${{ steps.core_client_counts.outputs.core_count }}
client_count: ${{ steps.core_client_counts.outputs.client_count }}
steps:
- uses: actions/checkout@master
- uses: actions/checkout@master
if: env.SKIP_MONETDB == 'false'
with:
token: ${{ secrets.PAT }}
repository: MonetDB/MonetDB
ref: 'Sep2022_7' # checking out the latest tag as the current master does not compile with GCC 11 (as of 2022-11-17)
path: ./MonetDB
- uses: actions/checkout@master
if: env.SKIP_MONETDB == 'false'
with:
token: ${{ secrets.PAT }}
repository: MonetDBSolutions/tpch-scripts
path: ./tpch-scripts
- uses: actions/checkout@master
if: env.SKIP_DUCKDB == 'false'
with:
token: ${{ secrets.PAT }}
repository: electrum/tpch-dbgen
path: ./tpch-dbgen
- name: Determine client and core counts for database comparison
id: core_client_counts
run: |
core_count=`grep -Pc '^processor\t' /proc/cpuinfo`
client_count=$(python -c "import math; print(int(math.ceil(${core_count}*0.75)))")
comparison_runtime=$(python -c "print(min(1800, max(300, int(${{ env.SCALE_FACTOR }}*3500))))")
echo "Using ${core_count} cores and ${client_count} clients, comparison benchmarks running for ${comparison_runtime} seconds."
echo "CORE_COUNT=${core_count}" >> $GITHUB_ENV
echo "CLIENT_COUNT=${client_count}" >> $GITHUB_ENV
echo "COMPARISON_RUNTIME=${comparison_runtime}" >> $GITHUB_ENV
echo "core_count=${core_count}" >> $GITHUB_OUTPUT
echo "client_count=${client_count}" >> $GITHUB_OUTPUT
- name: Install dependencies for Act setup
if: ${{ env.ACT }}
run: |
sudo apt-get update -y -qq
sudo apt-get install -y -qq git build-essential cmake python3-pip
- name: Install dependencies
run: |
sudo apt-get update -y -qq
          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq ninja-build libsqlite3-dev postgresql-server-dev-16 numactl bison python3-venv
python3 -m venv ~/venv
source ~/venv/bin/activate
pip3 install -r python/requirements.txt #--quiet
- name: Setup MonetDB
if: env.SKIP_MONETDB == 'false'
run: |
pushd MonetDB
mkdir rel
pushd rel
cmake -DCMAKE_INSTALL_PREFIX=~/monetdb_bin/ -DASSERT=OFF -DCMAKE_BUILD_TYPE=Release .. 1> /dev/null
cmake --build . --target install
echo "${HOME}/monetdb_bin/bin" >> $GITHUB_PATH
popd
popd
- name: Generate TPC-H data set (MonetDB)
if: env.SKIP_MONETDB == 'false'
run: |
mkdir -p monetdb_farm
pushd tpch-scripts
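          # tpch_build.sh generates TPC-H data at the given scale factor and loads it into the MonetDB farm.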
./tpch_build.sh -s ${{ env.SCALE_FACTOR }} -f ~/monetdb_farm
popd
- name: Benchmark MonetDB (database comparison)
if: env.SKIP_MONETDB == 'false'
run: |
pushd python
source ~/venv/bin/activate
python3 db_comparison_runner.py monetdb --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --clients=${{ env.CLIENT_COUNT }} --time=${{ env.COMPARISON_RUNTIME }}
python3 db_comparison_runner.py monetdb --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --determine_size_only
popd
- name: Generate TPC-H data set (for DuckDB and Umbra)
if: env.SKIP_DUCKDB == 'false'
run: |
pushd tpch-dbgen
make &> /dev/null
./dbgen -s ${{ env.SCALE_FACTOR }} -f
mkdir -p sf${{ env.SCALE_FACTOR }}
mv *.tbl sf${{ env.SCALE_FACTOR }}
popd
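          # Move dbgen and the generated .tbl files to the home directory, where db_comparison_runner.py presumably looks for them.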
mv tpch-dbgen ~
- name: Benchmark DuckDB (database comparison)
if: env.SKIP_DUCKDB == 'false'
run: |
pushd python
source ~/venv/bin/activate
python3 db_comparison_runner.py duckdb --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --clients=${{ env.CLIENT_COUNT }} --time=${{ env.COMPARISON_RUNTIME }}
python3 db_comparison_runner.py duckdb --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --determine_size_only
popd
- name: Upload benchmark results
uses: actions/upload-artifact@master
if: env.SKIP_DUCKDB == 'false' || env.SKIP_MONETDB == 'false'
with:
name: comparison_results
path: |
python/db_comparison_results/*.csv
plotting:
needs: [hyrise_full_pipeline, database_comparison]
name: Plotting
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@master
- uses: r-lib/actions/setup-r@v2
- uses: actions/download-artifact@master
with:
name: comparison_results
path: results_to_plot
- uses: actions/download-artifact@master
with:
name: comparison_results_hyrise_non-constrained
path: results_to_plot
- uses: actions/download-artifact@master
with:
name: comparison_results_hyrise_budget-constrained
path: results_to_plot
- name: Set environment variables
run: |
echo "HYRISE_CORE_COUNT=${{ needs.hyrise_full_pipeline.outputs.core_count }}" >> $GITHUB_ENV
echo "HYRISE_CLIENT_COUNT=${{ needs.hyrise_full_pipeline.outputs.client_count }}" >> $GITHUB_ENV
echo "CALIBRATION_RUN=${{ needs.hyrise_full_pipeline.outputs.calibration_run }}" >> $GITHUB_ENV
echo "COMPARISON_CORE_COUNT=${{ needs.database_comparison.outputs.core_count }}" >> $GITHUB_ENV
echo "COMPARISON_CLIENT_COUNT=${{ needs.database_comparison.outputs.client_count }}" >> $GITHUB_ENV
# Install R packages (install action did not work with act)
- name: Install dependencies
run: |
install.packages(c(
"dplyr",
"ggplot2",
"ggrepel"
))
shell: Rscript {0}
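      # plot.R reads the CSV files downloaded into results_to_plot (presumably together with the settings exported above) and writes db_comparison.pdf, which is uploaded below.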
- name: Plot
run: |
source("R/plot.R")
shell: Rscript {0}
- name: Upload database comparison plot
uses: actions/upload-artifact@master
with:
name: database_comparison
path: |
db_comparison.pdf