name: GitHub Main Action
on:
push:
pull_request:
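  # In addition to every push and pull request, the schedule below runs the pipeline monthly: 03:00 UTC on the 17th.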
schedule:
- cron: "0 3 17 * *"
defaults:
run:
shell: bash
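# The SKIP_* flags let individual systems be excluded; the corresponding steps check them via `if:` conditions.
# SCALE_FACTOR is the TPC-H scale factor used for data generation and all benchmark runs.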
env:
SKIP_HYRISE: false
SKIP_MONETDB: false
SKIP_DUCKDB: false
SCALE_FACTOR: 0.5
CMAKE_GENERATOR: Ninja
jobs:
hyrise_full_pipeline:
name: Hyrise - Full calibration and evaluation pipeline
runs-on: ubuntu-24.04
outputs:
core_count: ${{ steps.core_client_counts.outputs.core_count }}
client_count: ${{ steps.core_client_counts.outputs.client_count }}
calibration_run: ${{ steps.calibration.outputs.calibration_run }}
steps:
- uses: actions/checkout@master
with:
submodules: recursive
- name: Install dependencies for Act setup
if: ${{ env.ACT }}
run: |
sudo apt-get update -y -qq
sudo apt-get install -y -qq git build-essential cmake python3-pip
- name: Install dependencies
run: |
sudo apt-get update -y -qq
sudo add-apt-repository ppa:deadsnakes/ppa --yes
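          # The deadsnakes PPA added above provides the python3.11 packages installed below (Ubuntu 24.04 ships a newer default Python).
          # The apt search below only prints the available postgresql-server-dev versions.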
sudo apt search postgresql-server-dev
          # We don't use Hyrise's install_dependencies script, as it installs much more than this small setup needs.
sudo apt-get install -y ninja-build libboost-all-dev postgresql-server-dev-16 libtbb-dev libsqlite3-dev systemtap-sdt-dev lld numactl python3.11-full python3.11-venv
python3.11 -m venv ~/venv
source ~/venv/bin/activate
#python -m ensurepip --upgrade
#python -m pip install --upgrade pip
#python -m pip install --upgrade setuptools
#pip3 install setuptools # --quiet ... needed for latest Python version, re-check later
python -m pip install -r python/requirements.txt # --quiet
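      # The client count is roughly 75% of the available cores; the comparison runtime is SCALE_FACTOR * 3500 seconds, clamped to [300, 1800].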
- name: Determine core and client counts for database comparison
id: core_client_counts
run: |
core_count=`grep -Pc '^processor\t' /proc/cpuinfo`
client_count=$(python -c "import math; print(int(math.ceil(${core_count}*0.75)))")
comparison_runtime=$(python -c "print(min(1800, max(300, int(${{ env.SCALE_FACTOR }}*3500))))")
echo "Using ${core_count} cores and ${client_count} clients, comparison benchmarks running for ${comparison_runtime} seconds."
echo "CORE_COUNT=${core_count}" >> $GITHUB_ENV
echo "CLIENT_COUNT=${client_count}" >> $GITHUB_ENV
echo "COMPARISON_RUNTIME=${comparison_runtime}" >> $GITHUB_ENV
echo "core_count=${core_count}" >> $GITHUB_OUTPUT
echo "client_count=${client_count}" >> $GITHUB_OUTPUT
- name: Build release server and plugins
if: env.SKIP_HYRISE == 'false'
run: |
mkdir -p encoding_plugin/rel
pushd encoding_plugin/rel > /dev/null
          # Erase all segment encoding types. This hurts performance but allows us to compile in release mode on GitHub-hosted runners.
cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_POSIX_REGEX=0 -DTHREADS_PREFER_PTHREAD_FLAG=1 -DCMAKE_THREAD_LIBS_INIT="-lpthread" -DCMAKE_HAVE_THREADS_LIBRARY=1 -DCMAKE_USE_PTHREADS_INIT=1 -DERASE_SEGMENT_TYPES=Dictionary,LZ4,RunLength,FSST,FrameOfReference,Unencoded,FixedStringDictionary ..
cmake --build . --target hyriseServer WorkloadStatisticsPlugin WorkloadHandlerPlugin CommandExecutorPlugin DataCharacteristicsPlugin
popd > /dev/null
- name: Run calibration - data collection phase (TPC-H only)
if: env.SKIP_HYRISE == 'false'
run: |
          # We have a custom CMake target that might not trigger correctly. Since we don't use TPC-DS in this GitHub Actions
          # run, creating an empty directory should be fine.
          rm -rf encoding_plugin/rel/resources || true  # mkdir -p does not work with symlinks, so remove the directory first.
mkdir -p encoding_plugin/rel/resources/benchmark/tpcds/tpcds-result-reproduction/query_qualification
pushd python > /dev/null
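          # Collect calibration measurements for TPC-H at the configured scale factor (one base run, three random encoding configurations).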
python3 runner.py --hyrise_server_path=../encoding_plugin/rel/ --base_benchmark_runs=1 --single_benchmark=TPC-H --execute=calibration --scale_factor ${{ env.SCALE_FACTOR }} --random_encoding_configs_count=3
popd > /dev/null
- name: Run calibration - learn runtime and size models
id: calibration
if: env.SKIP_HYRISE == 'false'
run: |
pushd python > /dev/null
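          # Pick the most recently created calibration directory (ls -t sorts by modification time; result directories are filtered out).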
calibration_run=`ls -t calibration | grep -v 'results' | head -n1`
# Run pipeline without selection.
python3 encoding_selection_pipeline.py --calibration_dir=calibration/${calibration_run} --skip_phases selection
popd > /dev/null
echo "calibration_run=${calibration_run}" >> $GITHUB_OUTPUT
echo "CALIBRATION_RUN=${calibration_run}" >> $GITHUB_ENV
- name: Run encoding selection
if: env.SKIP_HYRISE == 'false'
run: |
pushd python > /dev/null
# Run selection. For simplicity: use calibration workload as workload to optimize.
python3 encoding_selection_pipeline.py --calibration_dir=calibration/${{ env.CALIBRATION_RUN }} --use_calibration_as_workload --skip_phases load_csv prepare learn_runtime learn_size --budget_steps_stretch_factor 5.0
popd > /dev/null
- name: Benchmark encoding configurations
if: env.SKIP_HYRISE == 'false'
run: |
pushd python
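          # Benchmark TPC-H once for each encoding configuration written by the selection step; results land in the evaluation results directory.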
python3 runner.py --hyrise_server_path=../encoding_plugin/rel/ --execute=evaluation --configurations_dir "evaluation/${{ env.CALIBRATION_RUN }}/configurations__default/TPCH" --results_dir "evaluation/${{ env.CALIBRATION_RUN }}/results/TPCH" --scale_factor ${{ env.SCALE_FACTOR }} --single_benchmark=TPCH --port 5551
popd
- name: Benchmark non-constrained Hyrise (database comparison)
if: env.SKIP_HYRISE == 'false'
run: |
pushd python
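          # The first run measures throughput with the full client count; the second run (--determine_size_only) only records the data size.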
python3 db_comparison_runner.py hyrise --hyrise_server_path=../encoding_plugin/rel/ --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --clients=${{ env.CLIENT_COUNT }} --time=${{ env.COMPARISON_RUNTIME }}
python3 db_comparison_runner.py hyrise --hyrise_server_path=../encoding_plugin/rel/ --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --determine_size_only
popd
- name: Benchmark budget-constrained Hyrise (database comparison)
if: env.SKIP_HYRISE == 'false'
run: |
pushd python
python3 runner.py --hyrise_server_path=../encoding_plugin/rel/ --execute=evaluation --configurations_dir "evaluation/${{ env.CALIBRATION_RUN }}/configurations__default/TPCH/LPCompressionSelection" --results_dir "evaluation/${{ env.CALIBRATION_RUN }}/results/TPCH/LPCompressionSelection" --scale_factor ${{ env.SCALE_FACTOR }} --single_benchmark=TPCH --port 5551 --cores=${{ env.CORE_COUNT }} --clients=${{ env.CLIENT_COUNT }}
popd
- name: Upload benchmark results (non-constrained)
uses: actions/upload-artifact@master
if: env.SKIP_HYRISE == 'false'
with:
name: comparison_results_hyrise_non-constrained
path: |
python/db_comparison_results/*.csv
- name: Upload benchmark results (budget-constrained)
uses: actions/upload-artifact@master
if: env.SKIP_HYRISE == 'false'
with:
name: comparison_results_hyrise_budget-constrained
path: |
python/evaluation/${{ env.CALIBRATION_RUN }}/results/TPCH/*.csv
python/evaluation/${{ env.CALIBRATION_RUN }}/results/TPCH/LPCompressionSelection/*.csv
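  # The comparison job builds MonetDB from source, generates TPC-H data, and benchmarks MonetDB and DuckDB with the same scale factor and client settings.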
database_comparison:
name: Database Comparison
runs-on: ubuntu-24.04
outputs:
core_count: ${{ steps.core_client_counts.outputs.core_count }}
client_count: ${{ steps.core_client_counts.outputs.client_count }}
steps:
- uses: actions/checkout@master
- uses: actions/checkout@master
if: env.SKIP_MONETDB == 'false'
with:
token: ${{ secrets.PAT }}
repository: MonetDB/MonetDB
ref: 'Sep2022_7' # checking out the latest tag as the current master does not compile with GCC 11 (as of 2022-11-17)
path: ./MonetDB
- uses: actions/checkout@master
if: env.SKIP_MONETDB == 'false'
with:
token: ${{ secrets.PAT }}
repository: MonetDBSolutions/tpch-scripts
path: ./tpch-scripts
- uses: actions/checkout@master
if: env.SKIP_DUCKDB == 'false'
with:
token: ${{ secrets.PAT }}
repository: electrum/tpch-dbgen
path: ./tpch-dbgen
- name: Determine client and core counts for database comparison
id: core_client_counts
run: |
core_count=`grep -Pc '^processor\t' /proc/cpuinfo`
client_count=$(python -c "import math; print(int(math.ceil(${core_count}*0.75)))")
comparison_runtime=$(python -c "print(min(1800, max(300, int(${{ env.SCALE_FACTOR }}*3500))))")
echo "Using ${core_count} cores and ${client_count} clients, comparison benchmarks running for ${comparison_runtime} seconds."
echo "CORE_COUNT=${core_count}" >> $GITHUB_ENV
echo "CLIENT_COUNT=${client_count}" >> $GITHUB_ENV
echo "COMPARISON_RUNTIME=${comparison_runtime}" >> $GITHUB_ENV
echo "core_count=${core_count}" >> $GITHUB_OUTPUT
echo "client_count=${client_count}" >> $GITHUB_OUTPUT
- name: Install dependencies for Act setup
if: ${{ env.ACT }}
run: |
sudo apt-get update -y -qq
sudo apt-get install -y -qq git build-essential cmake python3-pip
- name: Install dependencies
run: |
sudo apt-get update -y -qq
          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq ninja-build libsqlite3-dev postgresql-server-dev-16 numactl bison python3-venv
python3 -m venv ~/venv
source ~/venv/bin/activate
pip3 install -r python/requirements.txt #--quiet
- name: Setup MonetDB
if: env.SKIP_MONETDB == 'false'
run: |
pushd MonetDB
mkdir rel
pushd rel
cmake -DCMAKE_INSTALL_PREFIX=~/monetdb_bin/ -DASSERT=OFF -DCMAKE_BUILD_TYPE=Release .. 1> /dev/null
cmake --build . --target install
echo "${HOME}/monetdb_bin/bin" >> $GITHUB_PATH
popd
popd
- name: Generate TPC-H data set (MonetDB)
if: env.SKIP_MONETDB == 'false'
run: |
mkdir -p monetdb_farm
pushd tpch-scripts
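          # tpch_build.sh generates TPC-H data at the given scale factor and loads it into the MonetDB farm.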
./tpch_build.sh -s ${{ env.SCALE_FACTOR }} -f ~/monetdb_farm
popd
- name: Benchmark MonetDB (database comparison)
if: env.SKIP_MONETDB == 'false'
run: |
pushd python
source ~/venv/bin/activate
python3 db_comparison_runner.py monetdb --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --clients=${{ env.CLIENT_COUNT }} --time=${{ env.COMPARISON_RUNTIME }}
python3 db_comparison_runner.py monetdb --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --determine_size_only
popd
- name: Generate TPC-H data set (for DuckDB and Umbra)
if: env.SKIP_DUCKDB == 'false'
run: |
pushd tpch-dbgen
make &> /dev/null
./dbgen -s ${{ env.SCALE_FACTOR }} -f
mkdir -p sf${{ env.SCALE_FACTOR }}
mv *.tbl sf${{ env.SCALE_FACTOR }}
popd
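          # Move dbgen and the generated .tbl files to the home directory, where db_comparison_runner.py presumably looks for them.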
mv tpch-dbgen ~
- name: Benchmark DuckDB (database comparison)
if: env.SKIP_DUCKDB == 'false'
run: |
pushd python
source ~/venv/bin/activate
python3 db_comparison_runner.py duckdb --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --clients=${{ env.CLIENT_COUNT }} --time=${{ env.COMPARISON_RUNTIME }}
python3 db_comparison_runner.py duckdb --cores=${{ env.CORE_COUNT }} --scale_factor=${{ env.SCALE_FACTOR }} --determine_size_only
popd
- name: Upload benchmark results
uses: actions/upload-artifact@master
if: env.SKIP_DUCKDB == 'false' || env.SKIP_MONETDB == 'false'
with:
name: comparison_results
path: |
python/db_comparison_results/*.csv
plotting:
needs: [hyrise_full_pipeline, database_comparison]
name: Plotting
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@master
- uses: r-lib/actions/setup-r@v2
- uses: actions/download-artifact@master
with:
name: comparison_results
path: results_to_plot
- uses: actions/download-artifact@master
with:
name: comparison_results_hyrise_non-constrained
path: results_to_plot
- uses: actions/download-artifact@master
with:
name: comparison_results_hyrise_budget-constrained
path: results_to_plot
- name: Set environment variables
run: |
echo "HYRISE_CORE_COUNT=${{ needs.hyrise_full_pipeline.outputs.core_count }}" >> $GITHUB_ENV
echo "HYRISE_CLIENT_COUNT=${{ needs.hyrise_full_pipeline.outputs.client_count }}" >> $GITHUB_ENV
echo "CALIBRATION_RUN=${{ needs.hyrise_full_pipeline.outputs.calibration_run }}" >> $GITHUB_ENV
echo "COMPARISON_CORE_COUNT=${{ needs.database_comparison.outputs.core_count }}" >> $GITHUB_ENV
echo "COMPARISON_CLIENT_COUNT=${{ needs.database_comparison.outputs.client_count }}" >> $GITHUB_ENV
# Install R packages (install action did not work with act)
- name: Install dependencies
run: |
install.packages(c(
"dplyr",
"ggplot2",
"ggrepel"
))
shell: Rscript {0}
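      # plot.R reads the CSV files downloaded into results_to_plot (presumably together with the settings exported above) and writes db_comparison.pdf, which is uploaded below.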
- name: Plot
run: |
source("R/plot.R")
shell: Rscript {0}
- name: Upload database comparison plot
uses: actions/upload-artifact@master
with:
name: database_comparison
path: |
db_comparison.pdf