From b5490a04d3b73e520de9bdae0d132fa35a885665 Mon Sep 17 00:00:00 2001
From: Alex Ostrovski <slowli@users.noreply.github.com>
Date: Mon, 28 Oct 2024 18:31:35 +0200
Subject: [PATCH] test(vm): Improve instruction-counting VM benchmark (#3105)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## What ❔

Replaces `iai` with an alternative; brushes up instruction counting in
general.

## Why ❔

- The library currently used for the benchmark (`iai`) is unmaintained.
- It doesn't work with newer valgrind versions.
- It doesn't allow measuring parts of program execution, only the entire
program run.

## Checklist

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [x] Tests for the changes have been added / updated.
- [x] Documentation comments have been added / updated.
- [x] Code has been formatted via `zkstack dev fmt` and `zkstack dev
lint`.
---
 .github/workflows/vm-perf-comparison.yml      |  33 ++-
 .github/workflows/vm-perf-to-prometheus.yml   |   4 +-
 Cargo.lock                                    |  57 +++--
 Cargo.toml                                    |   4 +-
 core/lib/multivm/src/versions/vm_fast/mod.rs  |   2 +-
 core/lib/vm_executor/src/batch/factory.rs     |   2 +-
 core/tests/vm-benchmark/Cargo.toml            |   4 +-
 core/tests/vm-benchmark/benches/iai.rs        |  35 ---
 .../vm-benchmark/benches/instructions.rs      | 206 ++++++++++++++++++
 core/tests/vm-benchmark/src/bin/common/mod.rs |  54 -----
 .../src/bin/compare_iai_results.rs            | 108 ---------
 .../src/bin/iai_results_to_prometheus.rs      |  52 -----
 .../src/bin/instruction_counts.rs             | 106 ++++++++-
 core/tests/vm-benchmark/src/criterion.rs      |   6 +-
 core/tests/vm-benchmark/src/lib.rs            |   2 +-
 core/tests/vm-benchmark/src/vm.rs             | 114 ++++++----
 16 files changed, 446 insertions(+), 343 deletions(-)
 delete mode 100644 core/tests/vm-benchmark/benches/iai.rs
 create mode 100644 core/tests/vm-benchmark/benches/instructions.rs
 delete mode 100644 core/tests/vm-benchmark/src/bin/common/mod.rs
 delete mode 100644 core/tests/vm-benchmark/src/bin/compare_iai_results.rs
 delete mode 100644 core/tests/vm-benchmark/src/bin/iai_results_to_prometheus.rs

diff --git a/.github/workflows/vm-perf-comparison.yml b/.github/workflows/vm-perf-comparison.yml
index 49830a30cc1e..3520419f1337 100644
--- a/.github/workflows/vm-perf-comparison.yml
+++ b/.github/workflows/vm-perf-comparison.yml
@@ -40,6 +40,8 @@ jobs:
           echo "SCCACHE_GCS_SERVICE_ACCOUNT=gha-ci-runners@matterlabs-infra.iam.gserviceaccount.com" >> .env
           echo "SCCACHE_GCS_RW_MODE=READ_WRITE" >> .env
           echo "RUSTC_WRAPPER=sccache" >> .env
+          # Set the minimum reported instruction count difference to reduce noise
+          echo "BENCHMARK_DIFF_THRESHOLD_PERCENT=2" >> .env
 
       - name: init
         run: |
@@ -51,8 +53,8 @@ jobs:
         run: |
           ci_run zkstackup -g --local
           ci_run zkstack dev contracts --system-contracts
-          ci_run cargo bench --package vm-benchmark --bench iai | tee base-iai
-          ci_run cargo run --package vm-benchmark --release --bin instruction_counts | tee base-opcodes || touch base-opcodes
+          ci_run cargo bench --package vm-benchmark --bench instructions -- --verbose || echo "Instructions benchmark is missing"
+          ci_run cargo run --package vm-benchmark --release --bin instruction_counts | tee base-opcodes
 
       - name: checkout PR
         run: |
@@ -60,24 +62,39 @@ jobs:
 
       - name: run benchmarks on PR
         shell: bash
+        id: comparison
         run: |
           ci_run zkstackup -g --local
           ci_run zkstack dev contracts --system-contracts
-          ci_run cargo bench --package vm-benchmark --bench iai | tee pr-iai
-          ci_run cargo run --package vm-benchmark --release --bin instruction_counts | tee pr-opcodes || touch pr-opcodes
+          ci_run cargo bench --package vm-benchmark --bench instructions -- --verbose
 
+          ci_run cargo bench --package vm-benchmark --bench instructions -- --print > instructions.log 2>/dev/null
+          # Output all lines from the benchmark result starting from the "## ..." comparison header.
+          # Since the output spans multiple lines, we use a heredoc declaration.
           EOF=$(dd if=/dev/urandom bs=15 count=1 status=none | base64)
           echo "speedup<<$EOF" >> $GITHUB_OUTPUT
-          ci_run cargo run --package vm-benchmark --release --bin compare_iai_results base-iai pr-iai base-opcodes pr-opcodes >> $GITHUB_OUTPUT
+          sed -n '/^## /,$p' instructions.log >> $GITHUB_OUTPUT
+          echo "$EOF" >> $GITHUB_OUTPUT
+          
+          ci_run cargo run --package vm-benchmark --release --bin instruction_counts -- --diff base-opcodes > opcodes.log
+          echo "opcodes<<$EOF" >> $GITHUB_OUTPUT
+          sed -n '/^## /,$p' opcodes.log >> $GITHUB_OUTPUT
           echo "$EOF" >> $GITHUB_OUTPUT
-        id: comparison
 
       - name: Comment on PR
         uses: thollander/actions-comment-pull-request@fabd468d3a1a0b97feee5f6b9e499eab0dd903f6 # v2.5.0
+        if: steps.comparison.outputs.speedup != '' || steps.comparison.outputs.opcodes != ''
         with:
           message: |
-            ${{ steps.comparison.outputs.speedup == '' && '## No performance difference detected (anymore)' || '## Detected VM performance changes' }}
             ${{ steps.comparison.outputs.speedup }}
+            ${{ steps.comparison.outputs.opcodes }}
           comment_tag: vm-performance-changes
           mode: recreate
-          create_if_not_exists: ${{ steps.comparison.outputs.speedup != '' }}
+          create_if_not_exists: true
+      - name: Remove PR comment
+        uses: thollander/actions-comment-pull-request@fabd468d3a1a0b97feee5f6b9e499eab0dd903f6 # v2.5.0
+        if: steps.comparison.outputs.speedup == '' && steps.comparison.outputs.opcodes == ''
+        with:
+          comment_tag: vm-performance-changes
+          message: 'No performance difference detected (anymore)'
+          mode: delete
diff --git a/.github/workflows/vm-perf-to-prometheus.yml b/.github/workflows/vm-perf-to-prometheus.yml
index d336a1472e4a..93d33116794f 100644
--- a/.github/workflows/vm-perf-to-prometheus.yml
+++ b/.github/workflows/vm-perf-to-prometheus.yml
@@ -48,5 +48,5 @@ jobs:
           ci_run cargo bench --package vm-benchmark --bench oneshot
           # Run only benches with 1,000 transactions per batch to not spend too much time
           ci_run cargo bench --package vm-benchmark --bench batch '/1000$'
-          ci_run cargo bench --package vm-benchmark --bench iai | tee iai-result
-          ci_run cargo run --package vm-benchmark --bin iai_results_to_prometheus --release < iai-result
+          ci_run cargo bench --package vm-benchmark --bench instructions -- --verbose
+          ci_run cargo bench --package vm-benchmark --bench instructions -- --print
diff --git a/Cargo.lock b/Cargo.lock
index 2e3a8a1c3e3a..597da3c1b31b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -959,7 +959,7 @@ name = "block_reverter"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.5.18",
+ "clap 4.5.20",
  "serde_json",
  "tokio",
  "zksync_block_reverter",
@@ -1445,9 +1445,9 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.5.18"
+version = "4.5.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0956a43b323ac1afaffc053ed5c4b7c1f1800bacd1683c353aabbb752515dd3"
+checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8"
 dependencies = [
  "clap_builder",
  "clap_derive",
@@ -1455,14 +1455,15 @@ dependencies = [
 
 [[package]]
 name = "clap_builder"
-version = "4.5.18"
+version = "4.5.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4d72166dd41634086d5803a47eb71ae740e61d84709c36f3c34110173db3961b"
+checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54"
 dependencies = [
  "anstream",
  "anstyle",
  "clap_lex 0.7.2",
  "strsim 0.11.1",
+ "terminal_size",
 ]
 
 [[package]]
@@ -2796,7 +2797,7 @@ name = "genesis_generator"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.5.18",
+ "clap 4.5.20",
  "futures 0.3.30",
  "serde",
  "serde_json",
@@ -3472,12 +3473,6 @@ dependencies = [
  "tracing",
 ]
 
-[[package]]
-name = "iai"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "71a816c97c42258aa5834d07590b718b4c9a598944cd39a52dc25b351185d678"
-
 [[package]]
 name = "iana-time-zone"
 version = "0.1.61"
@@ -4122,7 +4117,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
 dependencies = [
  "cfg-if",
- "windows-targets 0.52.6",
+ "windows-targets 0.48.5",
 ]
 
 [[package]]
@@ -4392,7 +4387,7 @@ name = "merkle_tree_consistency_checker"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.5.18",
+ "clap 4.5.20",
  "tracing",
  "zksync_config",
  "zksync_env_config",
@@ -5498,7 +5493,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
 dependencies = [
  "bytes",
- "heck 0.5.0",
+ "heck 0.4.1",
  "itertools 0.12.1",
  "log",
  "multimap",
@@ -6583,7 +6578,7 @@ name = "selector_generator"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.5.18",
+ "clap 4.5.20",
  "ethabi",
  "glob",
  "hex",
@@ -7907,6 +7902,16 @@ dependencies = [
  "winapi-util",
 ]
 
+[[package]]
+name = "terminal_size"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f599bd7ca042cfdf8f4512b277c02ba102247820f9d9d4a9f521f496751a6ef"
+dependencies = [
+ "rustix",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "test-casing"
 version = "0.1.3"
@@ -8751,11 +8756,11 @@ version = "0.1.0"
 dependencies = [
  "assert_matches",
  "criterion",
- "iai",
  "once_cell",
  "rand 0.8.5",
  "tokio",
  "vise",
+ "yab",
  "zksync_contracts",
  "zksync_multivm",
  "zksync_types",
@@ -9239,6 +9244,18 @@ dependencies = [
  "zeroize",
 ]
 
+[[package]]
+name = "yab"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b06cc62d4cec617d3c259537be0fcaa8a5bcf72ddf2983823d9528605f36ed3"
+dependencies = [
+ "anes",
+ "clap 4.5.20",
+ "num_cpus",
+ "thiserror",
+]
+
 [[package]]
 name = "yansi"
 version = "1.0.1"
@@ -10229,7 +10246,7 @@ dependencies = [
  "anyhow",
  "assert_matches",
  "async-trait",
- "clap 4.5.18",
+ "clap 4.5.20",
  "envy",
  "futures 0.3.30",
  "rustc_version",
@@ -10437,7 +10454,7 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "assert_matches",
- "clap 4.5.18",
+ "clap 4.5.20",
  "insta",
  "leb128",
  "once_cell",
@@ -11017,7 +11034,7 @@ name = "zksync_server"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.5.18",
+ "clap 4.5.20",
  "futures 0.3.30",
  "serde_json",
  "tikv-jemallocator",
diff --git a/Cargo.toml b/Cargo.toml
index dc6fdf1727e3..6d51e5060aa8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -122,6 +122,7 @@ derive_more = "1.0.0"
 envy = "0.4"
 ethabi = "18.0.0"
 flate2 = "1.0.28"
+fraction = "0.15.3"
 futures = "0.3"
 glob = "0.3"
 google-cloud-auth = "0.16.0"
@@ -131,7 +132,6 @@ hex = "0.4"
 http = "1.1"
 httpmock = "0.7.0"
 hyper = "1.3"
-iai = "0.1"
 insta = "1.29.0"
 itertools = "0.10"
 jsonrpsee = { version = "0.23", default-features = false }
@@ -190,7 +190,7 @@ tracing-opentelemetry = "0.25.0"
 time = "0.3.36"                                                               # Has to be same as used by `tracing-subscriber`
 url = "2"
 web3 = "0.19.0"
-fraction = "0.15.3"
+yab = "0.1.0"
 
 # Proc-macro
 syn = "2.0"
diff --git a/core/lib/multivm/src/versions/vm_fast/mod.rs b/core/lib/multivm/src/versions/vm_fast/mod.rs
index bb5a342bff28..35789c6cdc9a 100644
--- a/core/lib/multivm/src/versions/vm_fast/mod.rs
+++ b/core/lib/multivm/src/versions/vm_fast/mod.rs
@@ -1,4 +1,4 @@
-pub use zksync_vm2::interface::Tracer;
+pub use zksync_vm2::interface;
 
 pub use self::{circuits_tracer::CircuitsTracer, vm::Vm};
 
diff --git a/core/lib/vm_executor/src/batch/factory.rs b/core/lib/vm_executor/src/batch/factory.rs
index f974d17f4a75..de0db5f0bf75 100644
--- a/core/lib/vm_executor/src/batch/factory.rs
+++ b/core/lib/vm_executor/src/batch/factory.rs
@@ -37,7 +37,7 @@ pub trait BatchTracer: fmt::Debug + 'static + Send + Sealed {
     const TRACE_CALLS: bool;
     /// Tracer for the fast VM.
     #[doc(hidden)]
-    type Fast: vm_fast::Tracer + Default + 'static;
+    type Fast: vm_fast::interface::Tracer + Default + 'static;
 }
 
 impl Sealed for () {}
diff --git a/core/tests/vm-benchmark/Cargo.toml b/core/tests/vm-benchmark/Cargo.toml
index 59c1e21493b4..892bcf1c1051 100644
--- a/core/tests/vm-benchmark/Cargo.toml
+++ b/core/tests/vm-benchmark/Cargo.toml
@@ -21,7 +21,7 @@ tokio.workspace = true
 
 [dev-dependencies]
 assert_matches.workspace = true
-iai.workspace = true
+yab.workspace = true
 
 [[bench]]
 name = "oneshot"
@@ -32,5 +32,5 @@ name = "batch"
 harness = false
 
 [[bench]]
-name = "iai"
+name = "instructions"
 harness = false
diff --git a/core/tests/vm-benchmark/benches/iai.rs b/core/tests/vm-benchmark/benches/iai.rs
deleted file mode 100644
index 8cbb9f10dd83..000000000000
--- a/core/tests/vm-benchmark/benches/iai.rs
+++ /dev/null
@@ -1,35 +0,0 @@
-use iai::black_box;
-use vm_benchmark::{BenchmarkingVm, BenchmarkingVmFactory, Bytecode, Fast, Legacy};
-
-fn run_bytecode<VM: BenchmarkingVmFactory>(name: &str) {
-    let tx = Bytecode::get(name).deploy_tx();
-    black_box(BenchmarkingVm::<VM>::default().run_transaction(&tx));
-}
-
-macro_rules! make_functions_and_main {
-    ($($file:ident => $legacy_name:ident,)+) => {
-        $(
-        fn $file() {
-            run_bytecode::<Fast>(stringify!($file));
-        }
-
-        fn $legacy_name() {
-            run_bytecode::<Legacy>(stringify!($file));
-        }
-        )+
-
-        iai::main!($($file, $legacy_name,)+);
-    };
-}
-
-make_functions_and_main!(
-    access_memory => access_memory_legacy,
-    call_far => call_far_legacy,
-    decode_shl_sub => decode_shl_sub_legacy,
-    deploy_simple_contract => deploy_simple_contract_legacy,
-    finish_eventful_frames => finish_eventful_frames_legacy,
-    write_and_decode => write_and_decode_legacy,
-    event_spam => event_spam_legacy,
-    slot_hash_collision => slot_hash_collision_legacy,
-    heap_read_write => heap_read_write_legacy,
-);
diff --git a/core/tests/vm-benchmark/benches/instructions.rs b/core/tests/vm-benchmark/benches/instructions.rs
new file mode 100644
index 000000000000..654dfef71b29
--- /dev/null
+++ b/core/tests/vm-benchmark/benches/instructions.rs
@@ -0,0 +1,206 @@
+//! Measures the number of host instructions required to run the benchmark bytecodes.
+
+use std::{env, sync::mpsc};
+
+use vise::{Gauge, LabeledFamily, Metrics};
+use vm_benchmark::{
+    criterion::PrometheusRuntime, BenchmarkingVm, BenchmarkingVmFactory, Fast, Legacy, BYTECODES,
+};
+use yab::{
+    reporter::{BenchmarkOutput, BenchmarkReporter, Reporter},
+    AccessSummary, BenchMode, Bencher, BenchmarkId,
+};
+
+fn benchmarks_for_vm<VM: BenchmarkingVmFactory>(bencher: &mut Bencher) {
+    bencher.bench(
+        BenchmarkId::new("init", VM::LABEL.as_str()),
+        BenchmarkingVm::<VM>::default,
+    );
+
+    for bytecode in BYTECODES {
+        bencher.bench_with_capture(
+            BenchmarkId::new(bytecode.name, VM::LABEL.as_str()),
+            |capture| {
+                let mut vm = yab::black_box(BenchmarkingVm::<VM>::default());
+                let tx = yab::black_box(bytecode.deploy_tx());
+                capture.measure(|| vm.run_transaction(&tx));
+            },
+        );
+    }
+}
+
+/// Reporter that pushes cachegrind metrics to Prometheus.
+#[derive(Debug)]
+struct MetricsReporter {
+    _runtime: Option<PrometheusRuntime>,
+}
+
+impl Default for MetricsReporter {
+    fn default() -> Self {
+        Self {
+            _runtime: PrometheusRuntime::new(),
+        }
+    }
+}
+
+impl Reporter for MetricsReporter {
+    fn new_benchmark(&mut self, id: &BenchmarkId) -> Box<dyn BenchmarkReporter> {
+        Box::new(MetricsBenchmarkReporter(id.clone()))
+    }
+}
+
+#[derive(Debug)]
+struct MetricsBenchmarkReporter(BenchmarkId);
+
+impl BenchmarkReporter for MetricsBenchmarkReporter {
+    fn ok(self: Box<Self>, output: &BenchmarkOutput) {
+        #[derive(Debug, Metrics)]
+        #[metrics(prefix = "vm_cachegrind")]
+        struct VmCachegrindMetrics {
+            #[metrics(labels = ["benchmark"])]
+            instructions: LabeledFamily<String, Gauge<u64>>,
+            #[metrics(labels = ["benchmark"])]
+            l1_accesses: LabeledFamily<String, Gauge<u64>>,
+            #[metrics(labels = ["benchmark"])]
+            l2_accesses: LabeledFamily<String, Gauge<u64>>,
+            #[metrics(labels = ["benchmark"])]
+            ram_accesses: LabeledFamily<String, Gauge<u64>>,
+            #[metrics(labels = ["benchmark"])]
+            cycles: LabeledFamily<String, Gauge<u64>>,
+        }
+
+        #[vise::register]
+        static VM_CACHEGRIND_METRICS: vise::Global<VmCachegrindMetrics> = vise::Global::new();
+
+        let id = self.0.to_string();
+        VM_CACHEGRIND_METRICS.instructions[&id].set(output.stats.total_instructions());
+        if let Some(&full) = output.stats.as_full() {
+            let summary = AccessSummary::from(full);
+            VM_CACHEGRIND_METRICS.l1_accesses[&id].set(summary.l1_hits);
+            VM_CACHEGRIND_METRICS.l2_accesses[&id].set(summary.l3_hits);
+            VM_CACHEGRIND_METRICS.ram_accesses[&id].set(summary.ram_accesses);
+            VM_CACHEGRIND_METRICS.cycles[&id].set(summary.estimated_cycles());
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+struct Comparison {
+    current_cycles: u64,
+    prev_cycles: Option<u64>,
+}
+
+impl Comparison {
+    fn percent_difference(a: u64, b: u64) -> f64 {
+        ((b as i64) - (a as i64)) as f64 / (a as f64) * 100.0
+    }
+
+    fn new(output: &BenchmarkOutput) -> Option<Self> {
+        let current_cycles = AccessSummary::from(*output.stats.as_full()?).estimated_cycles();
+        let prev_cycles = if let Some(prev_stats) = &output.prev_stats {
+            Some(AccessSummary::from(*prev_stats.as_full()?).estimated_cycles())
+        } else {
+            None
+        };
+
+        Some(Self {
+            current_cycles,
+            prev_cycles,
+        })
+    }
+
+    fn cycles_diff(&self) -> Option<f64> {
+        self.prev_cycles
+            .map(|prev_cycles| Self::percent_difference(prev_cycles, self.current_cycles))
+    }
+}
+
+/// Reporter that outputs diffs in a Markdown table to stdout after all benchmarks are completed.
+///
+/// Significant diff level can be changed via `BENCHMARK_DIFF_THRESHOLD_PERCENT` env var; it is set to 1% by default.
+#[derive(Debug)]
+struct ComparisonReporter {
+    comparisons_sender: mpsc::Sender<(String, Comparison)>,
+    comparisons_receiver: mpsc::Receiver<(String, Comparison)>,
+}
+
+impl Default for ComparisonReporter {
+    fn default() -> Self {
+        let (comparisons_sender, comparisons_receiver) = mpsc::channel();
+        Self {
+            comparisons_sender,
+            comparisons_receiver,
+        }
+    }
+}
+
+impl Reporter for ComparisonReporter {
+    fn new_benchmark(&mut self, id: &BenchmarkId) -> Box<dyn BenchmarkReporter> {
+        Box::new(BenchmarkComparison {
+            comparisons_sender: self.comparisons_sender.clone(),
+            id: id.clone(),
+        })
+    }
+
+    fn ok(self: Box<Self>) {
+        const ENV_VAR: &str = "BENCHMARK_DIFF_THRESHOLD_PERCENT";
+
+        let diff_threshold = env::var(ENV_VAR).unwrap_or_else(|_| "1.0".into());
+        let diff_threshold: f64 = diff_threshold.parse().unwrap_or_else(|err| {
+            panic!("incorrect `{ENV_VAR}` value: {err}");
+        });
+
+        // Drop the sender to not hang on the iteration below.
+        drop(self.comparisons_sender);
+        let mut comparisons: Vec<_> = self.comparisons_receiver.iter().collect();
+        comparisons.retain(|(_, diff)| {
+            // Output all stats if `diff_threshold <= 0.0` since this is what the user expects
+            diff.cycles_diff().unwrap_or(0.0) >= diff_threshold
+        });
+        if comparisons.is_empty() {
+            return;
+        }
+
+        comparisons.sort_unstable_by(|(name, _), (other_name, _)| name.cmp(other_name));
+
+        println!("\n## Detected VM performance changes");
+        println!("Benchmark name | Est. cycles | Change in est. cycles |");
+        println!("|:---|---:|---:|");
+        for (name, comparison) in &comparisons {
+            let diff = comparison
+                .cycles_diff()
+                .map_or_else(|| "N/A".to_string(), |diff| format!("{diff:+.1}%"));
+            println!("| {name} | {} | {diff} |", comparison.current_cycles);
+        }
+    }
+}
+
+#[derive(Debug)]
+struct BenchmarkComparison {
+    comparisons_sender: mpsc::Sender<(String, Comparison)>,
+    id: BenchmarkId,
+}
+
+impl BenchmarkReporter for BenchmarkComparison {
+    fn ok(self: Box<Self>, output: &BenchmarkOutput) {
+        if let Some(diff) = Comparison::new(output) {
+            self.comparisons_sender
+                .send((self.id.to_string(), diff))
+                .ok();
+        }
+    }
+}
+
+fn benchmarks(bencher: &mut Bencher) {
+    if bencher.mode() == BenchMode::PrintResults {
+        // Only customize reporting if outputting previously collected benchmark result in order to prevent
+        // reporters influencing cachegrind stats.
+        bencher
+            .add_reporter(MetricsReporter::default())
+            .add_reporter(ComparisonReporter::default());
+    }
+    benchmarks_for_vm::<Fast>(bencher);
+    benchmarks_for_vm::<Legacy>(bencher);
+}
+
+yab::main!(benchmarks);
diff --git a/core/tests/vm-benchmark/src/bin/common/mod.rs b/core/tests/vm-benchmark/src/bin/common/mod.rs
deleted file mode 100644
index a92c9d5f710c..000000000000
--- a/core/tests/vm-benchmark/src/bin/common/mod.rs
+++ /dev/null
@@ -1,54 +0,0 @@
-use std::io::BufRead;
-
-#[derive(Debug)]
-pub struct IaiResult {
-    pub name: String,
-    pub instructions: u64,
-    pub l1_accesses: u64,
-    pub l2_accesses: u64,
-    pub ram_accesses: u64,
-    pub cycles: u64,
-}
-
-pub fn parse_iai<R: BufRead>(iai_output: R) -> impl Iterator<Item = IaiResult> {
-    IaiResultParser {
-        lines: iai_output.lines().map(|x| x.unwrap()),
-    }
-}
-
-struct IaiResultParser<I: Iterator<Item = String>> {
-    lines: I,
-}
-
-impl<I: Iterator<Item = String>> Iterator for IaiResultParser<I> {
-    type Item = IaiResult;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.lines.next().map(|name| {
-            let result = IaiResult {
-                name,
-                instructions: self.parse_stat(),
-                l1_accesses: self.parse_stat(),
-                l2_accesses: self.parse_stat(),
-                ram_accesses: self.parse_stat(),
-                cycles: self.parse_stat(),
-            };
-            self.lines.next();
-            result
-        })
-    }
-}
-
-impl<I: Iterator<Item = String>> IaiResultParser<I> {
-    fn parse_stat(&mut self) -> u64 {
-        let line = self.lines.next().unwrap();
-        let number = line
-            .split(':')
-            .nth(1)
-            .unwrap()
-            .split_whitespace()
-            .next()
-            .unwrap();
-        number.parse().unwrap()
-    }
-}
diff --git a/core/tests/vm-benchmark/src/bin/compare_iai_results.rs b/core/tests/vm-benchmark/src/bin/compare_iai_results.rs
deleted file mode 100644
index c274b039c9bd..000000000000
--- a/core/tests/vm-benchmark/src/bin/compare_iai_results.rs
+++ /dev/null
@@ -1,108 +0,0 @@
-use std::{
-    collections::{HashMap, HashSet},
-    fs::File,
-    io::{BufRead, BufReader},
-};
-
-pub use crate::common::parse_iai;
-
-mod common;
-
-fn main() {
-    let [iai_before, iai_after, opcodes_before, opcodes_after] = std::env::args()
-        .skip(1)
-        .take(4)
-        .collect::<Vec<_>>()
-        .try_into()
-        .expect("expected four arguments");
-
-    let iai_before = get_name_to_cycles(&iai_before);
-    let iai_after = get_name_to_cycles(&iai_after);
-    let opcodes_before = get_name_to_opcodes(&opcodes_before);
-    let opcodes_after = get_name_to_opcodes(&opcodes_after);
-
-    let perf_changes = iai_before
-        .keys()
-        .collect::<HashSet<_>>()
-        .intersection(&iai_after.keys().collect())
-        .map(|&name| (name, percent_difference(iai_before[name], iai_after[name])))
-        .collect::<HashMap<_, _>>();
-
-    let duration_changes = opcodes_before
-        .keys()
-        .collect::<HashSet<_>>()
-        .intersection(&opcodes_after.keys().collect())
-        .map(|&name| {
-            let opcodes_abs_diff = (opcodes_after[name] as i64) - (opcodes_before[name] as i64);
-            (name, opcodes_abs_diff)
-        })
-        .collect::<HashMap<_, _>>();
-
-    let mut nonzero_diff = false;
-
-    for name in perf_changes
-        .iter()
-        .filter_map(|(key, value)| (value.abs() > 2.).then_some(key))
-        .collect::<HashSet<_>>()
-        .union(
-            &duration_changes
-                .iter()
-                .filter_map(|(key, value)| (*value != 0).then_some(key))
-                .collect(),
-        )
-    {
-        // write the header before writing the first line of diff
-        if !nonzero_diff {
-            println!("Benchmark name | change in estimated runtime | change in number of opcodes executed \n--- | --- | ---");
-            nonzero_diff = true;
-        }
-
-        let n_a = "N/A".to_string();
-        println!(
-            "{} | {} | {}",
-            name,
-            perf_changes
-                .get(**name)
-                .map(|percent| format!("{:+.1}%", percent))
-                .unwrap_or(n_a.clone()),
-            duration_changes
-                .get(**name)
-                .map(|abs_diff| format!(
-                    "{:+} ({:+.1}%)",
-                    abs_diff,
-                    percent_difference(opcodes_before[**name], opcodes_after[**name])
-                ))
-                .unwrap_or(n_a),
-        );
-    }
-
-    if nonzero_diff {
-        println!("\n Changes in number of opcodes executed indicate that the gas price of the benchmark has changed, which causes it run out of gas at a different time. Or that it is behaving completely differently.");
-    }
-}
-
-fn percent_difference(a: u64, b: u64) -> f64 {
-    ((b as f64) - (a as f64)) / (a as f64) * 100.0
-}
-
-fn get_name_to_cycles(filename: &str) -> HashMap<String, u64> {
-    parse_iai(BufReader::new(
-        File::open(filename).expect("failed to open file"),
-    ))
-    .map(|x| (x.name, x.cycles))
-    .collect()
-}
-
-fn get_name_to_opcodes(filename: &str) -> HashMap<String, u64> {
-    BufReader::new(File::open(filename).expect("failed to open file"))
-        .lines()
-        .map(|line| {
-            let line = line.unwrap();
-            let mut it = line.split_whitespace();
-            (
-                it.next().unwrap().to_string(),
-                it.next().unwrap().parse().unwrap(),
-            )
-        })
-        .collect()
-}
diff --git a/core/tests/vm-benchmark/src/bin/iai_results_to_prometheus.rs b/core/tests/vm-benchmark/src/bin/iai_results_to_prometheus.rs
deleted file mode 100644
index 3b3aa05bf69c..000000000000
--- a/core/tests/vm-benchmark/src/bin/iai_results_to_prometheus.rs
+++ /dev/null
@@ -1,52 +0,0 @@
-use std::{env, io::BufReader, time::Duration};
-
-use tokio::sync::watch;
-use vise::{Gauge, LabeledFamily, Metrics};
-use zksync_vlog::prometheus::PrometheusExporterConfig;
-
-use crate::common::{parse_iai, IaiResult};
-
-mod common;
-
-#[derive(Debug, Metrics)]
-#[metrics(prefix = "vm_cachegrind")]
-pub(crate) struct VmCachegrindMetrics {
-    #[metrics(labels = ["benchmark"])]
-    pub instructions: LabeledFamily<String, Gauge<u64>>,
-    #[metrics(labels = ["benchmark"])]
-    pub l1_accesses: LabeledFamily<String, Gauge<u64>>,
-    #[metrics(labels = ["benchmark"])]
-    pub l2_accesses: LabeledFamily<String, Gauge<u64>>,
-    #[metrics(labels = ["benchmark"])]
-    pub ram_accesses: LabeledFamily<String, Gauge<u64>>,
-    #[metrics(labels = ["benchmark"])]
-    pub cycles: LabeledFamily<String, Gauge<u64>>,
-}
-
-#[vise::register]
-pub(crate) static VM_CACHEGRIND_METRICS: vise::Global<VmCachegrindMetrics> = vise::Global::new();
-
-#[tokio::main]
-async fn main() {
-    let results: Vec<IaiResult> = parse_iai(BufReader::new(std::io::stdin())).collect();
-
-    let endpoint = env::var("BENCHMARK_PROMETHEUS_PUSHGATEWAY_URL")
-        .expect("`BENCHMARK_PROMETHEUS_PUSHGATEWAY_URL` env var is not set");
-    let (stop_sender, stop_receiver) = watch::channel(false);
-    let prometheus_config =
-        PrometheusExporterConfig::push(endpoint.to_owned(), Duration::from_millis(100));
-    tokio::spawn(prometheus_config.run(stop_receiver));
-
-    for result in results {
-        let name = result.name;
-        VM_CACHEGRIND_METRICS.instructions[&name.clone()].set(result.instructions);
-        VM_CACHEGRIND_METRICS.l1_accesses[&name.clone()].set(result.l1_accesses);
-        VM_CACHEGRIND_METRICS.l2_accesses[&name.clone()].set(result.l2_accesses);
-        VM_CACHEGRIND_METRICS.ram_accesses[&name.clone()].set(result.ram_accesses);
-        VM_CACHEGRIND_METRICS.cycles[&name].set(result.cycles);
-    }
-
-    println!("Waiting for push to happen...");
-    tokio::time::sleep(Duration::from_secs(1)).await;
-    stop_sender.send_replace(true);
-}
diff --git a/core/tests/vm-benchmark/src/bin/instruction_counts.rs b/core/tests/vm-benchmark/src/bin/instruction_counts.rs
index 96208007fd97..ece30a66cee3 100644
--- a/core/tests/vm-benchmark/src/bin/instruction_counts.rs
+++ b/core/tests/vm-benchmark/src/bin/instruction_counts.rs
@@ -1,16 +1,100 @@
 //! Runs all benchmarks and prints out the number of zkEVM opcodes each one executed.
 
-use vm_benchmark::{BenchmarkingVmFactory, Fast, Legacy, BYTECODES};
+use std::{collections::BTreeMap, env, fs, io, path::PathBuf};
 
-fn main() {
-    for bytecode in BYTECODES {
-        let tx = bytecode.deploy_tx();
-        let name = bytecode.name;
-        println!("{name} {}", Fast::<()>::count_instructions(&tx));
-        println!(
-            "{} {}",
-            name.to_string() + "_legacy",
-            Legacy::count_instructions(&tx)
-        );
+use vm_benchmark::{CountInstructions, Fast, Legacy, BYTECODES};
+
+#[derive(Debug)]
+enum Command {
+    Print,
+    Diff { old: PathBuf },
+}
+
+impl Command {
+    fn from_env() -> Self {
+        let mut args = env::args().skip(1);
+        let Some(first) = args.next() else {
+            return Self::Print;
+        };
+        assert_eq!(first, "--diff", "Unsupported command-line arg");
+        let old = args.next().expect("`--diff` requires a path to old file");
+        Self::Diff { old: old.into() }
     }
+
+    fn print_instructions(counts: &BTreeMap<&str, usize>) {
+        for (bytecode_name, count) in counts {
+            println!("{bytecode_name} {count}");
+        }
+    }
+
+    fn parse_counts(reader: impl io::BufRead) -> BTreeMap<String, usize> {
+        let mut counts = BTreeMap::new();
+        for line in reader.lines() {
+            let line = line.unwrap();
+            if line.is_empty() {
+                continue;
+            }
+            let (name, count) = line.split_once(' ').expect("invalid output format");
+            let count = count.parse().unwrap_or_else(|err| {
+                panic!("invalid count for `{name}`: {err}");
+            });
+            counts.insert(name.to_owned(), count);
+        }
+        counts
+    }
+
+    fn run(self) {
+        let counts: BTreeMap<_, _> = BYTECODES
+            .iter()
+            .map(|bytecode| {
+                let tx = bytecode.deploy_tx();
+                // We have a unit test comparing stats, but do it here as well just in case.
+                let fast_count = Fast::count_instructions(&tx);
+                let legacy_count = Legacy::count_instructions(&tx);
+                assert_eq!(
+                    fast_count, legacy_count,
+                    "mismatch on number of instructions on bytecode `{}`",
+                    bytecode.name
+                );
+
+                (bytecode.name, fast_count)
+            })
+            .collect();
+
+        match self {
+            Self::Print => Self::print_instructions(&counts),
+            Self::Diff { old } => {
+                let file = fs::File::open(&old).unwrap_or_else(|err| {
+                    panic!("failed opening `{}`: {err}", old.display());
+                });
+                let reader = io::BufReader::new(file);
+                let old_counts = Self::parse_counts(reader);
+
+                let differing_counts: Vec<_> = counts
+                    .iter()
+                    .filter_map(|(&name, &new_count)| {
+                        let old_count = *old_counts.get(name)?;
+                        (old_count != new_count).then_some((name, old_count, new_count))
+                    })
+                    .collect();
+
+                if !differing_counts.is_empty() {
+                    println!("## ⚠ Detected differing instruction counts");
+                    println!("| Benchmark | Old count | New count |");
+                    println!("|-----------|----------:|----------:|");
+                    for (name, old_count, new_count) in differing_counts {
+                        println!("| {name} | {old_count} | {new_count} |");
+                    }
+                    println!(
+                        "\nChanges in number of opcodes executed indicate that the gas price of the benchmark has changed, \
+                         which causes it to run out of gas at a different time."
+                    );
+                }
+            }
+        }
+    }
+}
+
+fn main() {
+    Command::from_env().run();
 }
diff --git a/core/tests/vm-benchmark/src/criterion.rs b/core/tests/vm-benchmark/src/criterion.rs
index 9515ac4ef988..024ccf14139f 100644
--- a/core/tests/vm-benchmark/src/criterion.rs
+++ b/core/tests/vm-benchmark/src/criterion.rs
@@ -57,7 +57,7 @@ struct VmBenchmarkMetrics {
 static METRICS: vise::Global<VmBenchmarkMetrics> = vise::Global::new();
 
 #[derive(Debug)]
-struct PrometheusRuntime {
+pub struct PrometheusRuntime {
     stop_sender: watch::Sender<bool>,
     _runtime: tokio::runtime::Runtime,
 }
@@ -72,7 +72,7 @@ impl Drop for PrometheusRuntime {
 }
 
 impl PrometheusRuntime {
-    fn new() -> Option<Self> {
+    pub fn new() -> Option<Self> {
         const PUSH_INTERVAL: Duration = Duration::from_millis(100);
 
         let gateway_url = env::var("BENCHMARK_PROMETHEUS_PUSHGATEWAY_URL").ok()?;
@@ -164,7 +164,7 @@ thread_local! {
 
 static BIN_NAME: SyncOnceCell<&'static str> = SyncOnceCell::new();
 
-/// Measurement for criterion that exports .
+/// Measurement for criterion that exports timing-related metrics.
 #[derive(Debug)]
 pub struct MeteredTime {
     _prometheus: Option<PrometheusRuntime>,
diff --git a/core/tests/vm-benchmark/src/lib.rs b/core/tests/vm-benchmark/src/lib.rs
index 4bd008d33196..9c4f547c1de2 100644
--- a/core/tests/vm-benchmark/src/lib.rs
+++ b/core/tests/vm-benchmark/src/lib.rs
@@ -6,7 +6,7 @@ pub use crate::{
         get_load_test_deploy_tx, get_load_test_tx, get_realistic_load_test_tx, get_transfer_tx,
         LoadTestParams,
     },
-    vm::{BenchmarkingVm, BenchmarkingVmFactory, Fast, Legacy, VmLabel},
+    vm::{BenchmarkingVm, BenchmarkingVmFactory, CountInstructions, Fast, Legacy, VmLabel},
 };
 
 pub mod criterion;
diff --git a/core/tests/vm-benchmark/src/vm.rs b/core/tests/vm-benchmark/src/vm.rs
index e198be9ea6b2..bf969e0de5c0 100644
--- a/core/tests/vm-benchmark/src/vm.rs
+++ b/core/tests/vm-benchmark/src/vm.rs
@@ -72,19 +72,21 @@ pub trait BenchmarkingVmFactory {
         system_env: SystemEnv,
         storage: &'static InMemoryStorage,
     ) -> Self::Instance;
+}
 
+pub trait CountInstructions {
     /// Counts instructions executed by the VM while processing the transaction.
     fn count_instructions(tx: &Transaction) -> usize;
 }
 
 /// Factory for the new / fast VM.
 #[derive(Debug)]
-pub struct Fast<Tr = ()>(Tr);
+pub struct Fast;
 
-impl<Tr: vm_fast::Tracer + Default + 'static> BenchmarkingVmFactory for Fast<Tr> {
+impl BenchmarkingVmFactory for Fast {
     const LABEL: VmLabel = VmLabel::Fast;
 
-    type Instance = vm_fast::Vm<&'static InMemoryStorage, Tr>;
+    type Instance = vm_fast::Vm<&'static InMemoryStorage>;
 
     fn create(
         batch_env: L1BatchEnv,
@@ -93,27 +95,30 @@ impl<Tr: vm_fast::Tracer + Default + 'static> BenchmarkingVmFactory for Fast<Tr>
     ) -> Self::Instance {
         vm_fast::Vm::custom(batch_env, system_env, storage)
     }
+}
 
+impl CountInstructions for Fast {
     fn count_instructions(tx: &Transaction) -> usize {
-        let mut vm = BenchmarkingVm::<Fast<InstructionCount>>::default();
-        vm.0.push_transaction(tx.clone());
+        use vm_fast::interface as vm2;
 
         #[derive(Default)]
         struct InstructionCount(usize);
-        impl vm_fast::Tracer for InstructionCount {
-            fn before_instruction<
-                OP: zksync_vm2::interface::OpcodeType,
-                S: zksync_vm2::interface::GlobalStateInterface,
-            >(
+
+        impl vm2::Tracer for InstructionCount {
+            fn before_instruction<OP: vm2::OpcodeType, S: vm2::GlobalStateInterface>(
                 &mut self,
                 _: &mut S,
             ) {
                 self.0 += 1;
             }
         }
-        let mut tracer = InstructionCount(0);
 
-        vm.0.inspect(&mut tracer, InspectExecutionMode::OneTx);
+        let (system_env, l1_batch_env) = test_env();
+        let mut vm =
+            vm_fast::Vm::<_, InstructionCount>::custom(l1_batch_env, system_env, &*STORAGE);
+        vm.push_transaction(tx.clone());
+        let mut tracer = InstructionCount(0);
+        vm.inspect(&mut tracer, InspectExecutionMode::OneTx);
         tracer.0
     }
 }
@@ -135,7 +140,9 @@ impl BenchmarkingVmFactory for Legacy {
         let storage = StorageView::new(storage).to_rc_ptr();
         vm_latest::Vm::new(batch_env, system_env, storage)
     }
+}
 
+impl CountInstructions for Legacy {
     fn count_instructions(tx: &Transaction) -> usize {
         let mut vm = BenchmarkingVm::<Self>::default();
         vm.0.push_transaction(tx.clone());
@@ -150,41 +157,44 @@ impl BenchmarkingVmFactory for Legacy {
     }
 }
 
+fn test_env() -> (SystemEnv, L1BatchEnv) {
+    let timestamp = unix_timestamp_ms();
+    let system_env = SystemEnv {
+        zk_porter_available: false,
+        version: ProtocolVersionId::latest(),
+        base_system_smart_contracts: SYSTEM_CONTRACTS.clone(),
+        bootloader_gas_limit: BATCH_COMPUTATIONAL_GAS_LIMIT,
+        execution_mode: TxExecutionMode::VerifyExecute,
+        default_validation_computational_gas_limit: BATCH_COMPUTATIONAL_GAS_LIMIT,
+        chain_id: L2ChainId::from(270),
+    };
+    let l1_batch_env = L1BatchEnv {
+        previous_batch_hash: None,
+        number: L1BatchNumber(1),
+        timestamp,
+        fee_input: BatchFeeInput::l1_pegged(
+            50_000_000_000, // 50 gwei
+            250_000_000,    // 0.25 gwei
+        ),
+        fee_account: Address::random(),
+        enforced_base_fee: None,
+        first_l2_block: L2BlockEnv {
+            number: 1,
+            timestamp,
+            prev_block_hash: L2BlockHasher::legacy_hash(L2BlockNumber(0)),
+            max_virtual_blocks_to_create: 100,
+        },
+    };
+    (system_env, l1_batch_env)
+}
+
 #[derive(Debug)]
 pub struct BenchmarkingVm<VM: BenchmarkingVmFactory>(VM::Instance);
 
 impl<VM: BenchmarkingVmFactory> Default for BenchmarkingVm<VM> {
     fn default() -> Self {
-        let timestamp = unix_timestamp_ms();
-        Self(VM::create(
-            L1BatchEnv {
-                previous_batch_hash: None,
-                number: L1BatchNumber(1),
-                timestamp,
-                fee_input: BatchFeeInput::l1_pegged(
-                    50_000_000_000, // 50 gwei
-                    250_000_000,    // 0.25 gwei
-                ),
-                fee_account: Address::random(),
-                enforced_base_fee: None,
-                first_l2_block: L2BlockEnv {
-                    number: 1,
-                    timestamp,
-                    prev_block_hash: L2BlockHasher::legacy_hash(L2BlockNumber(0)),
-                    max_virtual_blocks_to_create: 100,
-                },
-            },
-            SystemEnv {
-                zk_porter_available: false,
-                version: ProtocolVersionId::latest(),
-                base_system_smart_contracts: SYSTEM_CONTRACTS.clone(),
-                bootloader_gas_limit: BATCH_COMPUTATIONAL_GAS_LIMIT,
-                execution_mode: TxExecutionMode::VerifyExecute,
-                default_validation_computational_gas_limit: BATCH_COMPUTATIONAL_GAS_LIMIT,
-                chain_id: L2ChainId::from(270),
-            },
-            &STORAGE,
-        ))
+        let (system_env, l1_batch_env) = test_env();
+        Self(VM::create(l1_batch_env, system_env, &STORAGE))
     }
 }
 
@@ -231,7 +241,7 @@ mod tests {
     use super::*;
     use crate::{
         get_deploy_tx, get_heavy_load_test_tx, get_load_test_deploy_tx, get_load_test_tx,
-        get_realistic_load_test_tx, get_transfer_tx, LoadTestParams,
+        get_realistic_load_test_tx, get_transfer_tx, LoadTestParams, BYTECODES,
     };
 
     #[test]
@@ -282,4 +292,22 @@ mod tests {
         let res = vm.run_transaction(&get_heavy_load_test_tx(1));
         assert_matches!(res.result, ExecutionResult::Success { .. });
     }
+
+    #[test]
+    fn instruction_count_matches_on_both_vms_for_transfer() {
+        let tx = get_transfer_tx(0);
+        let legacy_count = Legacy::count_instructions(&tx);
+        let fast_count = Fast::count_instructions(&tx);
+        assert_eq!(legacy_count, fast_count);
+    }
+
+    #[test]
+    fn instruction_count_matches_on_both_vms_for_benchmark_bytecodes() {
+        for bytecode in BYTECODES {
+            let tx = bytecode.deploy_tx();
+            let legacy_count = Legacy::count_instructions(&tx);
+            let fast_count = Fast::count_instructions(&tx);
+            assert_eq!(legacy_count, fast_count, "bytecode: {}", bytecode.name);
+        }
+    }
 }