Run benchmarks #1

Workflow file for this run

.github/workflows/benchmark.yml at fa8f4b1

	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#

	name: Run benchmarks

	# Manually triggered only. All inputs arrive as strings via
	# github.event.inputs.<name>.
	on:
	  workflow_dispatch:
	    inputs:
	      # Benchmark class to run; passed as the final argument to spark-submit.
	      # A value containing '*' or 'TPCDSQueryBenchmark' also enables the
	      # TPC-DS data generation job.
	      class:
	        description: 'Benchmark class'
	        required: true
	        default: '*'
	      # Forwarded to actions/setup-java as java-version.
	      jdk:
	        description: 'JDK version: 17 or 21'
	        required: true
	        default: '17'
	      # Selects the sbt profile -Pscala-<version>.
	      scala:
	        description: 'Scala version: 2.13'
	        required: true
	        default: '2.13'
	      # Exported to the benchmark job as SPARK_BENCHMARK_FAILFAST.
	      failfast:
	        description: 'Failfast: true or false'
	        required: true
	        default: 'true'
	      # Number of matrix entries the benchmark job is split into.
	      num-splits:
	        description: 'Number of job splits'
	        required: true
	        default: '1'

	jobs:
	  # Builds the JSON array "[1,2,...,num-splits]" and exposes it as the job
	  # output `matrix`, which the benchmark job feeds into fromJSON().
	  matrix-gen:
	    name: Generate matrix for job splits
	    runs-on: ubuntu-20.04
	    outputs:
	      matrix: ${{ steps.set-matrix.outputs.matrix }}
	    env:
	      # The input is passed through the environment rather than being
	      # interpolated directly into the shell command below.
	      SPARK_BENCHMARK_NUM_SPLITS: ${{ github.event.inputs.num-splits }}
	    steps:
	    - name: Generate matrix
	      id: set-matrix
	      # `seq -s, 1 N` prints 1..N comma-separated; the echo wraps it in brackets.
	      run: echo "matrix=["`seq -s, 1 $SPARK_BENCHMARK_NUM_SPLITS`"]" >> $GITHUB_OUTPUT

	# Any TPC-DS related updates on this job need to be applied to tpcds-1g job of build_and_test.yml as well
	tpcds-1g-gen:
	name: "Generate an input dataset for TPCDSQueryBenchmark with SF=1"
	if: contains(github.event.inputs.class, 'TPCDSQueryBenchmark') \|\| contains(github.event.inputs.class, '*')
	runs-on: ubuntu-20.04
	env:
	SPARK_LOCAL_IP: localhost
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v3
	# In order to get diff files
	with:
	fetch-depth: 0
	- name: Cache Scala, SBT and Maven
	uses: actions/cache@v3
	with:
	path: \|
	build/apache-maven-*
	build/scala-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v3
	with:
	path: ~/.cache/coursier
	key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	benchmark-coursier-${{ github.event.inputs.jdk }}
	- name: Cache TPC-DS generated data
	id: cache-tpcds-sf-1
	uses: actions/cache@v3
	with:
	path: ./tpcds-sf-1
	key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
	- name: Checkout tpcds-kit repository
	if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
	uses: actions/checkout@v3
	with:
	repository: databricks/tpcds-kit
	ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069
	path: ./tpcds-kit
	- name: Build tpcds-kit
	if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
	run: cd tpcds-kit/tools && make OS=LINUX
	- name: Install Java ${{ github.event.inputs.jdk }}
	if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
	uses: actions/setup-java@v3
	with:
	distribution: zulu
	java-version: ${{ github.event.inputs.jdk }}
	- name: Generate TPC-DS (SF=1) table data
	if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
	run: build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite"

	benchmark:
	name: "Run benchmarks: ${{ github.event.inputs.class }} (JDK ${{ github.event.inputs.jdk }}, Scala ${{ github.event.inputs.scala }}, ${{ matrix.split }} out of ${{ github.event.inputs.num-splits }} splits)"
	if: always()
	needs: [matrix-gen, tpcds-1g-gen]
	# Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
	runs-on: ubuntu-20.04
	strategy:
	fail-fast: false
	matrix:
	split: ${{fromJSON(needs.matrix-gen.outputs.matrix)}}
	env:
	SPARK_BENCHMARK_FAILFAST: ${{ github.event.inputs.failfast }}
	SPARK_BENCHMARK_NUM_SPLITS: ${{ github.event.inputs.num-splits }}
	SPARK_BENCHMARK_CUR_SPLIT: ${{ matrix.split }}
	SPARK_GENERATE_BENCHMARK_FILES: 1
	SPARK_LOCAL_IP: localhost
	# To prevent spark.test.home not being set. See more detail in SPARK-36007.
	SPARK_HOME: ${{ github.workspace }}
	SPARK_TPCDS_DATA: ${{ github.workspace }}/tpcds-sf-1
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v3
	# In order to get diff files
	with:
	fetch-depth: 0
	- name: Cache Scala, SBT and Maven
	uses: actions/cache@v3
	with:
	path: \|
	build/apache-maven-*
	build/scala-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v3
	with:
	path: ~/.cache/coursier
	key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	benchmark-coursier-${{ github.event.inputs.jdk }}
	- name: Install Java ${{ github.event.inputs.jdk }}
	uses: actions/setup-java@v3
	with:
	distribution: zulu
	java-version: ${{ github.event.inputs.jdk }}
	- name: Cache TPC-DS generated data
	if: contains(github.event.inputs.class, 'TPCDSQueryBenchmark') \|\| contains(github.event.inputs.class, '*')
	id: cache-tpcds-sf-1
	uses: actions/cache@v3
	with:
	path: ./tpcds-sf-1
	key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
	- name: Run benchmarks
	run: \|
	./build/sbt -Pscala-${{ github.event.inputs.scala }} -Pyarn -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pspark-ganglia-lgpl Test/package
	# Make less noisy
	cp conf/log4j2.properties.template conf/log4j2.properties
	sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties
	# In benchmark, we use local as master so set driver memory only. Note that GitHub Actions has 7 GB memory limit.
	bin/spark-submit \
	--driver-memory 6g --class org.apache.spark.benchmark.Benchmarks \
	--jars "`find . -name '-SNAPSHOT-tests.jar' -o -name 'avro*-SNAPSHOT.jar' \| paste -sd ',' -`" \
	"`find . -name 'spark-core*-SNAPSHOT-tests.jar'`" \
	"${{ github.event.inputs.class }}"
	# To keep the directory structure and file permissions, tar them
	# See also https://github.com/actions/upload-artifact#maintaining-file-permissions-and-case-sensitive-files
	echo "Preparing the benchmark results:"
	tar -cvf benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}.tar `git diff --name-only` `git ls-files --others --exclude=tpcds-sf-1 --exclude-standard`
	- name: Upload benchmark results
	uses: actions/upload-artifact@v3
	with:
	name: benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}-${{ matrix.split }}
	path: benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}.tar

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Run benchmarks #1

Workflow file

Run benchmarks #1

Jobs

Run details

Workflow file for this run