Skip to content

Commit

Permalink
Update the CI pipeline to allow for concurrent agents on one machine
Browse files Browse the repository at this point in the history
Start using the Buildkite Docker plugin to manage containers.
The post-checkout hook no longer kills stale containers, because doing so prevented jobs from running concurrently on the same machine.
  • Loading branch information
nibty committed Dec 15, 2024
1 parent 199420b commit 9da3356
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 54 deletions.
34 changes: 0 additions & 34 deletions .buildkite/hooks/post-checkout
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,3 @@ CI_BUILD_START=$(date +%s)
export CI_BUILD_START

source ci/env.sh

#
# Kill any running docker containers, which are potentially left over from the
# previous CI job
#
# Runs in a subshell, so `line`, `id` and `image` do not leak into the rest of
# the script.
(
echo "+++ Killing stale docker containers"
# Each input line is one row of `docker ps`; `tail -n +2` starts output at the
# second line, skipping the column header.
while read -r line; do
# Split the row on whitespace: first field -> id, second -> image, the
# remaining fields are discarded into `_`.
read -r id image _ <<<"$line"

# The right-hand side of =~ is quoted, so it is matched as a literal substring
# rather than as a regular expression.
if [[ $image =~ "solanalabs/rust" ]]; then
# Only stdout of `docker kill` is silenced; the id is reported only when the
# kill succeeded.
if docker kill "$id" >/dev/null; then
echo "kill $id $image"
fi
continue
fi
done < <(docker ps | tail -n +2)
)

# Processes from previously aborted CI jobs seem to loiter, unclear why as one
# would expect the buildkite-agent to clean up all child processes of the
# aborted CI job.
# But as a workaround for now manually kill some known loiterers. These
# processes will all have the `init` process as their PPID:
# Runs in a subshell, so `victims`, `name` and `victim` stay local to it.
(
victims=
# Collect the pids of matching processes into one space-separated string.
for name in bash cargo docker solana; do
# -u "$(id -u)": only processes owned by the current user
# -P 1:          only processes whose parent pid is 1
# -d \ :         separate the printed pids with a space (the escaped space is
#                the delimiter argument; $name is the pattern)
victims="$victims $(pgrep -u "$(id -u)" -P 1 -d \ $name)"
done
# $victims is intentionally unquoted so it word-splits into individual pids.
for victim in $victims; do
echo "Killing pid $victim"
# `|| true` keeps a failed kill (e.g. the process already exited) from
# producing a non-zero status for this command.
kill -9 "$victim" || true
done
)
6 changes: 3 additions & 3 deletions .buildkite/scripts/build-bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ build_steps() {
{
"name": "$1",
"command": "$2",
"timeout_in_minutes": 30,
"timeout_in_minutes": 60,
"agent": "$agent",
"retry": 3
}
Expand All @@ -22,5 +22,5 @@ EOF

# shellcheck disable=SC2016
group "bench" \
"$(build_steps "bench-part-1" ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/bench/part1.sh")" \
"$(build_steps "bench-part-2" ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/bench/part2.sh")"
"$(build_steps "bench-part-1" "ci/bench/part1.sh")" \
"$(build_steps "bench-part-2" "ci/bench/part2.sh")"
6 changes: 3 additions & 3 deletions .buildkite/scripts/build-stable.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ partitions=$(
cat <<EOF
{
"name": "partitions",
"command": ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_stable_docker_image ci/stable/run-partition.sh",
"command": "ci/stable/run-partition.sh",
"timeout_in_minutes": 30,
"agent": "$agent",
"parallelism": 2,
Expand All @@ -25,7 +25,7 @@ local_cluster_partitions=$(
cat <<EOF
{
"name": "local-cluster",
"command": ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_stable_docker_image ci/stable/run-local-cluster-partially.sh",
"command": "ci/stable/run-local-cluster-partially.sh",
"timeout_in_minutes": 30,
"agent": "$agent",
"parallelism": 5,
Expand All @@ -38,7 +38,7 @@ localnet=$(
cat <<EOF
{
"name": "localnet",
"command": ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_stable_docker_image ci/stable/run-localnet.sh",
"command": "ci/stable/run-localnet.sh",
"timeout_in_minutes": 30,
"agent": "$agent"
}
Expand Down
16 changes: 16 additions & 0 deletions .buildkite/scripts/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

export INDENT_LEVEL=2

. ci/rust-version.sh

indent() {
local indent=${1:-"$INDENT_LEVEL"}
sed "s/^/$(printf ' %.0s' $(seq 1 "$indent"))/"
Expand Down Expand Up @@ -69,6 +71,20 @@ EOF
cat <<EOF | indent | sed '/DELETE_THIS_LINE/d'
- name: "$name"
command: "$command"
plugins:
- docker#v5.12.0:
image: "$rust_nightly_docker_image"
workdir: /solana
propagate-environment: true
propagate-uid-gid: true
environment:
- "RUSTC_WRAPPER=/usr/local/cargo/bin/sccache"
- BUILDKITE_AGENT_ACCESS_TOKEN
- AWS_SECRET_ACCESS_KEY
- AWS_ACCESS_KEY_ID
- SCCACHE_BUCKET
- SCCACHE_REGION
- SCCACHE_S3_KEY_PREFIX
timeout_in_minutes: $timeout_in_minutes
agents:
queue: "$agent"
Expand Down
70 changes: 57 additions & 13 deletions ci/buildkite-pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,22 @@
set -e
cd "$(dirname "$0")"/..

source ci/env.sh

output_file=${1:-/dev/stderr}

if [[ -n $CI_PULL_REQUEST ]]; then
# filter pr number from ci branch.
[[ $CI_BRANCH =~ pull/([0-9]+)/head ]]
pr_number=${BASH_REMATCH[1]}
echo "get affected files from PR: $pr_number"
if [[ -n $BUILDKITE_PULL_REQUEST ]]; then
pr_number=$BUILDKITE_PULL_REQUEST
else
# filter pr number from ci branch.
[[ $CI_BRANCH =~ pull/([0-9]+)/head ]]
pr_number=${BASH_REMATCH[1]}
fi
echo "get affected files from PR: $pr_number"

# get affected files
readarray -t affected_files < <(gh pr diff --name-only "$pr_number")
readarray -t affected_files < <(GH_TOKEN="$(buildkite-agent secret get GH_TOKEN)" gh pr diff --name-only "$pr_number")
if [[ ${#affected_files[*]} -eq 0 ]]; then
echo "Unable to determine the files affected by this PR"
exit 1
Expand Down Expand Up @@ -118,6 +124,30 @@ command_step() {
EOF
}

# Append one pipeline step that uses the docker#v5.12.0 plugin to $output_file.
# Globals:
#   output_file (read) - path the generated step text is appended to
# Arguments:
#   $1 - step name
#   $2 - command for the step
#   $3 - docker image for the plugin's `image:` field
#   $4 - value for `timeout_in_minutes`
#   $5 - agent queue (optional; defaults to "solana")
# Outputs:
#   Nothing on stdout; the step text is appended to $output_file.
docker_command_step() {
  local step_name=$1
  local step_command=$2
  local step_image=$3
  local step_timeout=$4
  local step_queue=${5:-solana}

  # Unquoted delimiter: the locals above are expanded inside the here-document.
  cat <<EOF >> "$output_file"
- name: "$step_name"
command: "$step_command"
plugins:
- docker#v5.12.0:
image: "$step_image"
workdir: /solana
propagate-environment: true
propagate-uid-gid: true
environment:
- "RUSTC_WRAPPER=/usr/local/cargo/bin/sccache"
- BUILDKITE_AGENT_ACCESS_TOKEN
- AWS_SECRET_ACCESS_KEY
- AWS_ACCESS_KEY_ID
- SCCACHE_BUCKET
- SCCACHE_REGION
- SCCACHE_S3_KEY_PREFIX
timeout_in_minutes: $step_timeout
artifact_paths: "log-*.txt"
agents:
queue: "$step_queue"
EOF
}

trigger_secondary_step() {
cat >> "$output_file" <<"EOF"
Expand All @@ -140,9 +170,10 @@ wait_step() {
}

all_test_steps() {
command_step checks1 ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-checks.sh" 20 check
command_step checks2 ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-dev-context-only-utils.sh check-bins" 15 check
command_step checks3 ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-dev-context-only-utils.sh check-all-targets" 15 check
. ci/rust-version.sh
docker_command_step checks1 "ci/test-checks.sh" $rust_nightly_docker_image 20 check
docker_command_step checks2 "ci/test-dev-context-only-utils.sh check-bins" $rust_nightly_docker_image 20 check
docker_command_step checks3 "ci/test-dev-context-only-utils.sh check-all-targets" $rust_nightly_docker_image 20 check
wait_step

# Full test suite
Expand All @@ -156,7 +187,7 @@ all_test_steps() {
^ci/rust-version.sh \
^ci/test-docs.sh \
; then
command_step doctest ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_stable_docker_image ci/test-docs.sh" 15
docker_command_step doctest "ci/test-docs.sh" $rust_nightly_docker_image 15
else
annotate --style info --context test-docs \
"Docs skipped as no .rs files were modified"
Expand All @@ -182,7 +213,20 @@ all_test_steps() {
cargo-test-sbf$ \
; then
cat >> "$output_file" <<"EOF"
- command: ". ci/rust-version.sh; ci/docker-run.sh $$rust_stable_docker_image ci/test-stable-sbf.sh"
- command: "ci/test-stable-sbf.sh"
plugins:
- docker#v5.12.0:
image: "$rust_nightly_docker_image"
workdir: /solana
propagate-environment: true
propagate-uid-gid: true
environment:
- "RUSTC_WRAPPER=/usr/local/cargo/bin/sccache"
- AWS_SECRET_ACCESS_KEY
- AWS_ACCESS_KEY_ID
- SCCACHE_BUCKET
- SCCACHE_REGION
- SCCACHE_S3_KEY_PREFIX
name: "stable-sbf"
timeout_in_minutes: 35
artifact_paths: "sbf-dumps.tar.bz2"
Expand Down Expand Up @@ -226,7 +270,7 @@ EOF
^ci/test-stable.sh \
^sdk/ \
; then
command_step wasm ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_stable_docker_image ci/test-wasm.sh" 20
docker_command_step wasm "ci/test-wasm.sh" $rust_nightly_docker_image 20
else
annotate --style info \
"wasm skipped as no relevant files were modified"
Expand Down Expand Up @@ -258,7 +302,7 @@ EOF
^ci/test-coverage.sh \
^scripts/coverage.sh \
; then
command_step coverage ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-coverage.sh" 80
docker_command_step coverage "ci/test-coverage.sh" $rust_nightly_docker_image 80
else
annotate --style info --context test-coverage \
"Coverage skipped as no .rs files were modified"
Expand Down Expand Up @@ -296,7 +340,7 @@ pull_or_push_steps() {

if [ -z "$diff_other_than_version_bump" ]; then
echo "Diff only contains version bump."
command_step checks ". ci/rust-version.sh; ci/docker-run.sh \$\$rust_nightly_docker_image ci/test-checks.sh" 20
docker_command_step checks "ci/test-checks.sh" $rust_nightly_docker_image 20
exit 0
fi
fi
Expand Down
10 changes: 9 additions & 1 deletion ci/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ if [[ -n $CI ]]; then
export CI_REPO_SLUG=$TRAVIS_REPO_SLUG
export CI_TAG=$TRAVIS_TAG
elif [[ -n $BUILDKITE ]]; then
AWS_ACCESS_KEY_ID="$(buildkite-agent secret get AWS_ACCESS_KEY_ID)"
AWS_SECRET_ACCESS_KEY="$(buildkite-agent secret get AWS_SECRET_ACCESS_KEY)"
SCCACHE_BUCKET="$(buildkite-agent secret get SCCACHE_BUCKET)"
SCCACHE_REGION="$(buildkite-agent secret get SCCACHE_REGION)"
export AWS_ACCESS_KEY_ID
export AWS_SECRET_ACCESS_KEY
export SCCACHE_BUCKET
export SCCACHE_REGION
export CI_BRANCH=$BUILDKITE_BRANCH
export CI_BUILD_ID=$BUILDKITE_BUILD_ID
if [[ $BUILDKITE_COMMIT = HEAD ]]; then
Expand All @@ -31,7 +39,7 @@ if [[ -n $CI ]]; then
# The standard BUILDKITE_PULL_REQUEST environment variable is always "false" due
# to how solana-ci-gate is used to trigger PR builds rather than using the
# standard Buildkite PR trigger.
if [[ $CI_BRANCH =~ pull/* ]]; then
if [[ $CI_BRANCH =~ pull/* ]] || [[ -n $BUILDKITE_PULL_REQUEST_BASE_BRANCH ]]; then
export CI_BASE_BRANCH=$BUILDKITE_PULL_REQUEST_BASE_BRANCH
export CI_PULL_REQUEST=true
else
Expand Down

0 comments on commit 9da3356

Please sign in to comment.