Merge pull request #59 from robertknight/transpose-bench

Add simple benchmark for Transpose ops, extract rten-bench crate
robertknight · Mar 18, 2024 · ff8d22d · ff8d22d
2 parents 677ae51 + 846c040
commit ff8d22d
Show file tree

Hide file tree

Showing 13 changed files with 176 additions and 95 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -2,11 +2,14 @@
 members = [
   ".",
   "rten-cli",
-  "rten-examples",
   "rten-imageio",
   "rten-imageproc",
   "rten-tensor",
   "rten-text",
+
+  # Development crates. These are not published.
+  "rten-bench",
+  "rten-examples",
 ]
 default-members = [
   ".",
@@ -36,6 +39,7 @@ rten-tensor = { path = "./rten-tensor", version = "0.4.0" }
 rten-vecmath = { path = "./rten-vecmath", version = "0.4.0" }
 
 [dev-dependencies]
+rten-bench = { path = "./rten-bench" }
 serde_json = "1.0.91"
 
 [lib]

diff --git a/rten-bench/Cargo.toml b/rten-bench/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "rten-bench"
+version = "0.1.0"
+edition = "2021"
+authors = ["Robert Knight"]
+description = "Benchmarking utilities for use in RTen development"
+license = "MIT OR Apache-2.0"
+homepage = "https://github.com/robertknight/rten"
+repository = "https://github.com/robertknight/rten"
+
+[lib]
+crate-type = ["lib"]
diff --git a/rten-bench/src/lib.rs b/rten-bench/src/lib.rs
@@ -0,0 +1,65 @@
+use std::fmt::Display;
+use std::time::Instant;
+
+/// Summary statistics for a benchmark run. All fields are in milliseconds.
+#[derive(Default)]
+pub struct BenchStats {
+    /// Duration of the slowest trial.
+    pub max: f32,
+
+    /// Arithmetic mean of the trial durations.
+    pub mean: f32,
+
+    /// Median of the trial durations.
+    pub median: f32,
+
+    /// Duration of the fastest trial.
+    pub min: f32,
+
+    /// Mean absolute deviation from the mean, as computed by `run_bench` (not a true variance).
+    pub var: f32,
+}
+
+/// Run a benchmark function `f` for `trials` iterations and print statistics
+/// about the run.
+///
+/// Each call to `f` is timed individually with [`Instant`]. The per-call
+/// durations, in milliseconds, are summarized into a [`BenchStats`] and a
+/// one-line summary prefixed with `description` is printed to stdout.
+///
+/// If `trials` is zero, `f` is never called, nothing is printed and a default
+/// (all-zero) `BenchStats` is returned.
+pub fn run_bench<F: FnMut(), D: Display>(trials: usize, description: D, mut f: F) -> BenchStats {
+    if trials == 0 {
+        return BenchStats::default();
+    }
+
+    let mut times = Vec::with_capacity(trials);
+    for _ in 0..trials {
+        let start = Instant::now();
+
+        f();
+
+        let duration_ms = start.elapsed().as_secs_f64() * 1000.0;
+        times.push(duration_ms as f32);
+    }
+
+    // Sort ascending so that min, max and median can be read off by position.
+    times.sort_by(|a, b| a.total_cmp(b));
+    let min = times[0];
+    let max = times[times.len() - 1];
+
+    let mid = times.len() / 2;
+    let median = if times.len() % 2 == 1 {
+        times[mid]
+    } else {
+        // Even count: average the two central values. `times` is non-empty
+        // and of even length here, so `mid >= 1` and both indices are valid.
+        (times[mid - 1] + times[mid]) / 2.
+    };
+    let mean = times.iter().sum::<f32>() / times.len() as f32;
+
+    // NOTE: despite the name, this is the mean absolute deviation from the
+    // mean rather than the statistical variance.
+    let var = times.iter().map(|x| (x - mean).abs()).sum::<f32>() / times.len() as f32;
+
+    println!(
+        "{}. mean {:.3}ms median {:.3} var {:.3} min {:.3} max {:.3}",
+        description, mean, median, var, min, max
+    );
+
+    BenchStats {
+        max,
+        mean,
+        median,
+        min,
+        var,
+    }
+}
diff --git a/rten-imageproc/Cargo.toml b/rten-imageproc/Cargo.toml
@@ -11,5 +11,8 @@ repository = "https://github.com/robertknight/rten"
 [dependencies]
 rten-tensor = { path = "../rten-tensor", version = "0.4.0" }
 
+[dev-dependencies]
+rten-bench = { path = "../rten-bench" }
+
 [lib]
 crate-type = ["lib"]
diff --git a/rten-imageproc/src/contours.rs b/rten-imageproc/src/contours.rs
@@ -439,7 +439,7 @@ mod tests {
     #[test]
     #[ignore]
     fn bench_find_contours() {
-        use rten_tensor::test_util::bench_loop;
+        use rten_bench::run_bench;
 
         // Fill a mask with a grid of rectangular objects.
         let mask_h = 1024;
@@ -465,14 +465,9 @@ mod tests {
         }
 
         let n_iters = 100;
-        let stats = bench_loop(n_iters, || {
+        run_bench(n_iters, "find_contours", || {
             let contours = find_contours(mask.view(), RetrievalMode::External);
             assert_eq!(contours.len(), (grid_rows * grid_cols) as usize);
         });
-        println!(
-            "find_contours {:.3} ms, {:.3} ns/elem",
-            stats.duration_ms(),
-            stats.duration_ns() / (mask_h * mask_w * n_iters) as f64
-        );
     }
 }
diff --git a/rten-tensor/src/test_util.rs b/rten-tensor/src/test_util.rs
@@ -191,45 +191,6 @@ pub fn eq_with_nans(a: TensorView, b: TensorView) -> bool {
     }
 }
 
-pub struct BenchStats {
-    /// Duration in seconds.
-    duration: f64,
-}
-
-const SECS_TO_MS: f64 = 1000.;
-const SECS_TO_US: f64 = 1_000_000.;
-const SECS_TO_NS: f64 = 1_000_000_000.;
-
-impl BenchStats {
-    /// Return total duration in milliseconds.
-    pub fn duration_ms(&self) -> f64 {
-        self.duration * SECS_TO_MS
-    }
-
-    /// Return total duration in microseconds.
-    pub fn duration_us(&self) -> f64 {
-        self.duration * SECS_TO_US
-    }
-
-    /// Return total duration in nanoseconds.
-    pub fn duration_ns(&self) -> f64 {
-        self.duration * SECS_TO_NS
-    }
-}
-
-/// A very simple benchmark helper which runs `f` for `n_iters` iterations.
-pub fn bench_loop<F: FnMut()>(n_iters: usize, mut f: F) -> BenchStats {
-    let start = std::time::Instant::now();
-
-    for _ in 0..n_iters {
-        f();
-    }
-
-    BenchStats {
-        duration: start.elapsed().as_secs_f64(),
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::ApproxEq;

diff --git a/src/gemm.rs b/src/gemm.rs
@@ -910,6 +910,7 @@ mod tests {
     use std::error::Error;
     use std::ops::Range;
 
+    use rten_bench::run_bench;
     use rten_tensor::prelude::*;
     use rten_tensor::rng::XorShiftRng;
     use rten_tensor::test_util::expect_equal;
@@ -919,7 +920,6 @@ mod tests {
         add_scaled_vector, gemm, round_up, GemmExecutor, GemmInputA, GemmInputB, KernelHint,
         VirtualMatrix,
     };
-    use crate::test_util::run_bench;
 
     fn reference_matmul(a: &Tensor, b: &Tensor) -> Tensor {
         let [a_rows, _a_cols]: [usize; 2] = a.shape().try_into().expect("input should be a matrix");

diff --git a/src/lib.rs b/src/lib.rs
@@ -61,6 +61,3 @@ mod schema_generated;
 // create an abstraction around model execution instead.
 #[doc(hidden)]
 pub mod model_builder;
-
-#[cfg(test)]
-mod test_util;
diff --git a/src/ops/conv.rs b/src/ops/conv.rs
@@ -1313,10 +1313,11 @@ mod tests {
     #[test]
     #[ignore]
     fn bench_col2im() {
-        use super::col2im;
-        use rten_tensor::test_util::bench_loop;
+        use rten_bench::run_bench;
         use rten_tensor::NdTensor;
 
+        use super::col2im;
+
         let out_chans = 32;
         let in_height = 64;
         let in_width = 64;
@@ -1333,14 +1334,12 @@ mod tests {
             &mut rng,
         );
 
-        let stats = bench_loop(100, || {
+        run_bench(100, "col2im", || {
             col2im(
                 &mut output.view_mut(),
                 &columns.view(),
                 [stride_y, stride_x],
             );
         });
-
-        println!("col2im duration {:3} ms", stats.duration_ms());
     }
 }
diff --git a/src/ops/layout.rs b/src/ops/layout.rs
@@ -519,6 +519,7 @@ impl Operator for Unsqueeze {
 mod tests {
     use std::error::Error;
 
+    use rten_bench::run_bench;
     use rten_tensor::prelude::*;
     use rten_tensor::rng::XorShiftRng;
     use rten_tensor::test_util::expect_equal;
@@ -910,4 +911,80 @@ mod tests {
             Some(OpError::InvalidValue("Axes must be unique"))
         );
     }
+
+    #[test]
+    #[ignore]
+    fn bench_transpose() {
+        // Number of timed runs for each benchmark.
+        const TRIALS: usize = 100;
+
+        // One benchmark configuration: an input shape and the permutation
+        // applied to it.
+        struct Case<'a> {
+            /// Input shape
+            shape: &'a [usize],
+
+            /// Permutation order (eg. `[1, 0]` for a matrix transpose)
+            perm: &'a [usize],
+        }
+
+        let cases = [
+            // Identity permutation, ie. a no-op transpose.
+            Case {
+                shape: &[512, 512],
+                perm: &[0, 1],
+            },
+            // Plain matrix transposes at several sizes.
+            Case {
+                shape: &[256, 256],
+                perm: &[1, 0],
+            },
+            Case {
+                shape: &[512, 512],
+                perm: &[1, 0],
+            },
+            Case {
+                shape: &[1024, 1024],
+                perm: &[1, 0],
+            },
+            // Transposes from the Whisper encoder (base model), using 4
+            // batches of samples.
+            Case {
+                shape: &[4, 1500, 8, 64],
+                perm: &[0, 2, 1, 3],
+            },
+            Case {
+                shape: &[4, 8, 1500, 64],
+                perm: &[0, 2, 1, 3],
+            },
+            // Transposes from the Whisper decoder (base model).
+            Case {
+                shape: &[1, 1500, 8, 64],
+                perm: &[0, 2, 3, 1],
+            },
+            Case {
+                shape: &[1, 288, 8, 64],
+                perm: &[0, 2, 1, 3],
+            },
+        ];
+
+        let mut rng = XorShiftRng::new(1234);
+
+        for Case { shape, perm } in cases {
+            let input = Tensor::rand(shape, &mut rng);
+
+            // Baseline: a plain copy of the input, which gives a lower bound
+            // on how fast a transpose could run.
+            let copy_label = format!("copy {:?}", shape);
+            let copy_stats = run_bench(TRIALS, copy_label, || {
+                input.view().to_tensor();
+            });
+
+            let transpose_label = format!("transpose {:?} perm {:?}", shape, perm);
+            let transpose_stats = run_bench(TRIALS, transpose_label, || {
+                transpose(input.view(), Some(perm)).unwrap();
+            });
+
+            // Extra mean time spent by transpose relative to the copy,
+            // clamped at zero, as a fraction of the copy time.
+            let extra_ms = (transpose_stats.mean - copy_stats.mean).max(0.);
+            let transpose_overhead = extra_ms / copy_stats.mean;
+            println!("transpose {:?} overhead {}", shape, transpose_overhead);
+        }
+    }
 }
diff --git a/src/ops/matmul.rs b/src/ops/matmul.rs
@@ -226,13 +226,13 @@ impl Operator for MatMul {
 mod tests {
     use std::error::Error;
 
+    use rten_bench::run_bench;
     use rten_tensor::prelude::*;
     use rten_tensor::rng::XorShiftRng;
     use rten_tensor::test_util::expect_equal;
     use rten_tensor::{Tensor, TensorView, TensorViewMut};
 
     use crate::gemm::gemm;
-    use crate::test_util::run_bench;
 
     use super::{gemm_op, matmul, matmul_impl, MatmulStrategy, OpError};