Skip to content

Commit

Permalink
[GHA] replay-verify replays at state snapshot versions
Browse files Browse the repository at this point in the history
so that no work is wasted

1. added `gen-replay-verify-jobs` sub-command to the aptos-debugger, to
   generate txn ranges that begin at state snapshots and are of the desired
   size in number of transactions. If there are too many txns between two
   adjacent snapshots, the range is truncated to the target size. This
   way we deal with Tapos-like load-test situations automatically.
2. Each job runs only one replay -- there's no longer "partitions".
   Instead, we issue a lot more jobs with concurrency control. This way
   jobs run in "waves" and "load balancing" is automatically achieved.
3. A single "prepare" job does the building and jobs generation, and the
   actual replay jobs don't need to build the binary, etc.

max-parallel: 100

use fewer machines
  • Loading branch information
msmouse committed Sep 20, 2024
1 parent ce6158a commit ff148d3
Show file tree
Hide file tree
Showing 9 changed files with 388 additions and 31 deletions.
1 change: 1 addition & 0 deletions .github/actions/rust-setup/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ runs:
# rust-cache action will cache ~/.cargo and ./target
# https://github.com/Swatinem/rust-cache#cache-details
- name: Run cargo cache
if: !startsWith(github.ref, 'refs/pull/')
uses: Swatinem/rust-cache@359a70e43a0bb8a13953b04a90f76428b4959bb6 # [email protected]
with:
key: ${{ inputs.ADDITIONAL_KEY }}
Expand Down
31 changes: 23 additions & 8 deletions .github/workflows/replay-verify.yaml

Large diffs are not rendered by default.

142 changes: 122 additions & 20 deletions .github/workflows/workflow-run-replay-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ on:
required: false
type: string
description: The list of transaction versions to skip. If not specified, it will use the default list.
RANGES_TO_SKIP:
required: false
type: string
description: The optional list of transaction ranges to skip.
BACKUP_CONFIG_TEMPLATE_PATH:
description: "The path to the backup config template to use."
type: string
Expand All @@ -39,7 +43,7 @@ on:
description: "Github job timeout in minutes"
type: number
required: true
default: 720
default: 180
# This allows the workflow to be triggered manually from the Github UI or CLI
# NOTE: because the "number" type is not supported, we default to 720 minute timeout
workflow_dispatch:
Expand All @@ -65,6 +69,10 @@ on:
required: false
type: string
description: The list of transaction versions to skip. If not specified, it will use the default list.
RANGES_TO_SKIP:
required: false
type: string
description: The optional list of transaction ranges to skip.
BACKUP_CONFIG_TEMPLATE_PATH:
description: "The path to the backup config template to use."
type: string
Expand All @@ -77,40 +85,134 @@ on:
default: "high-perf-docker-with-local-ssd"

jobs:
prepare:
runs-on: "runs-on,cpu=64,family=c7,hdd=500,image=aptos-ubuntu-x64,spot=false"
outputs:
jobs_ids: ${{ steps.gen-jobs.outputs.job_ids }}
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.GIT_SHA }}

- name: Load cached aptos-debugger binary
id: cache-aptos-debugger-binary
uses: actions/cache@v4
with:
# copy the binary to the root of the repo and cache it there, because rust-setup calls a cache-rust action
# which cleans up the target directory in its post action
path: |
aptos-debugger
testsuite/replay_verify.py
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}

- name: Prepare for build if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
with:
GIT_CREDENTIALS: ${{ inputs.GIT_CREDENTIALS }}

- name: Build and strip aptos-debugger binary if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
shell: bash
run: |
cargo build --release -p aptos-debugger
strip -s target/release/aptos-debugger
cp target/release/aptos-debugger .
- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: get timestamp to use in cache key
id: get-timestamp
run: echo "ts=$(date +%s)" >> $GITHUB_OUTPUT

- name: Load cached backup storage metadata cache dir (and save back afterwards)
uses: actions/cache@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ steps.get-timestamp.outputs.ts }}
restore-keys: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-

- name: Generate job ranges
id: gen-jobs
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
run: |
./aptos-debugger aptos-db gen-replay-verify-jobs \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
--start-version ${{ inputs.HISTORY_START }} \
--ranges-to-skip "${{ inputs.RANGES_TO_SKIP }}" \
\
--max-ranges-per-job 16 \
--output-json-file jobs.json \
JOB_IDS=`jq 'length as $N | [range(0; $N)]' jobs.json`
echo "jobs_ids=$JOB_IDS" >> $GITHUB_OUTPUT
- name: Cache backup storage config and job definition
uses: actions/cache/save@v4
with:
path: |
${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
jobs.json
key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}

replay-verify:
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 720 }}
needs: prepare
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 180 }}
runs-on: ${{ inputs.RUNS_ON }}
strategy:
fail-fast: false
matrix:
number: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] # runner number
job_id: ${{ fromJson(steps.prepare.outputs.job_ids) }}
steps:
- name: Echo Runner Number
run: echo "Runner is ${{ matrix.number }}"
- uses: actions/checkout@v4
- name: Load cached aptos-debugger binary and replay_verify.py script
uses: actions/cache/restore@v4
with:
ref: ${{ inputs.GIT_SHA }}
path: |
aptos-debugger
testsuite/replay_verify.py
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
fail-on-cache-miss: true

- uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
- name: Load cached backup storage metadata cache dir
uses: actions/cache/restore@v4
with:
GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
fail-on-cache-miss: true

- name: Load cached backup storage config and job definitions
uses: actions/cache/restore@v4
with:
path: |
${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
jobs.json
key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}
fail-on-cache-miss: true

- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: Build CLI binaries in release mode
shell: bash
run: cargo build --release -p aptos-debugger

- name: Run replay-verify in parallel
shell: bash
run: testsuite/replay_verify.py ${{ matrix.number }} 19 # first argument is the runner number, second argument is the total number of runners
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START }}
TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
run: |
# extract job by job_id
jq '.[${{ matrix.job_id }}].[]' jobs.json | while read _desc begin end msg; do
echo $begin-$end: $msg
sleep 30&
done
echo "start waiting"
wait
echo "done waiting"
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 23 additions & 3 deletions execution/executor/src/chunk_executor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ use std::{
atomic::{AtomicBool, Ordering},
Arc,
},
time::Instant,
};

pub static SIG_VERIFY_POOL: Lazy<Arc<rayon::ThreadPool>> = Lazy::new(|| {
Expand Down Expand Up @@ -598,9 +599,11 @@ impl<V: VMExecutor> TransactionReplayer for ChunkExecutorInner<V> {
mut event_vecs: Vec<Vec<ContractEvent>>,
verify_execution_mode: &VerifyExecutionMode,
) -> Result<()> {
let started = Instant::now();
let num_txns = transactions.len();
let mut latest_view = self.commit_queue.lock().expect_latest_view()?;
let chunk_begin = latest_view.num_transactions() as Version;
let chunk_end = chunk_begin + transactions.len() as Version; // right-exclusive
let chunk_end = chunk_begin + num_txns as Version; // right-exclusive

// Find epoch boundaries.
let mut epochs = Vec::new();
Expand Down Expand Up @@ -636,11 +639,28 @@ impl<V: VMExecutor> TransactionReplayer for ChunkExecutorInner<V> {

self.commit_queue
.lock()
.enqueue_chunk_to_commit_directly(executed_chunk.expect("Nothing to commit."))
.enqueue_chunk_to_commit_directly(executed_chunk.expect("Nothing to commit."))?;
info!(
num_txns = num_txns,
tps = (num_txns as f64 / started.elapsed().as_secs_f64()),
"TransactionReplayer::replay() OK"
);

Ok(())
}

fn commit(&self) -> Result<ExecutedChunk> {
self.commit_chunk_impl()
let started = Instant::now();

let chunk = self.commit_chunk_impl()?;

let num_committed = chunk.transactions_to_commit().len();
info!(
num_committed = num_committed,
tps = num_committed as f64 / started.elapsed().as_secs_f64(),
"TransactionReplayer::commit() OK"
);
Ok(chunk)
}
}

Expand Down
4 changes: 4 additions & 0 deletions storage/backup/backup-cli/src/metadata/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ impl MetadataView {
self.compaction_timestamps.clone()
}

/// Returns the full list of state snapshot backup metadata entries held by
/// this view (borrowed; the caller clones individual entries if needed).
pub fn all_state_snapshots(&self) -> &[StateSnapshotBackupMeta] {
&self.state_snapshot_backups
}

pub fn select_state_snapshot(
&self,
target_version: Version,
Expand Down
1 change: 1 addition & 0 deletions storage/db-tool/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ aptos-vm = { workspace = true }
bcs = { workspace = true }
clap = { workspace = true }
itertools = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }

[dev-dependencies]
Expand Down
Loading

0 comments on commit ff148d3

Please sign in to comment.