Skip to content

Commit

Permalink
Merge pull request #59 from robertknight/transpose-bench
Browse files Browse the repository at this point in the history
Add simple benchmark for Transpose ops, extract rten-bench crate
  • Loading branch information
robertknight authored Mar 18, 2024
2 parents 677ae51 + 846c040 commit ff8d22d
Show file tree
Hide file tree
Showing 13 changed files with 176 additions and 95 deletions.
6 changes: 6 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
members = [
".",
"rten-cli",
"rten-examples",
"rten-imageio",
"rten-imageproc",
"rten-tensor",
"rten-text",

# Development crates. These are not published.
"rten-bench",
"rten-examples",
]
default-members = [
".",
Expand Down Expand Up @@ -36,6 +39,7 @@ rten-tensor = { path = "./rten-tensor", version = "0.4.0" }
rten-vecmath = { path = "./rten-vecmath", version = "0.4.0" }

[dev-dependencies]
rten-bench = { path = "./rten-bench" }
serde_json = "1.0.91"

[lib]
Expand Down
12 changes: 12 additions & 0 deletions rten-bench/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[package]
name = "rten-bench"
version = "0.1.0"
edition = "2021"
authors = ["Robert Knight"]
description = "Benchmarking utilities for use in RTen development"
license = "MIT OR Apache-2.0"
homepage = "https://github.com/robertknight/rten"
repository = "https://github.com/robertknight/rten"

[lib]
crate-type = ["lib"]
65 changes: 65 additions & 0 deletions rten-bench/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use std::fmt::Display;
use std::time::Instant;

/// Statistics from a benchmark run. All fields are durations in milliseconds.
///
/// The `Default` value has every field set to zero; `run_bench` returns it
/// when asked to run zero trials.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct BenchStats {
    /// Duration of longest run.
    pub max: f32,

    /// Mean duration.
    pub mean: f32,

    /// Median duration.
    pub median: f32,

    /// Minimum duration.
    pub min: f32,

    /// Spread of durations around the mean.
    ///
    /// Note: despite the name, `run_bench` computes this as the mean absolute
    /// deviation from the mean (average of `|x - mean|`), not the
    /// squared-deviation variance, so it is also expressed in milliseconds.
    pub var: f32,
}

/// Run a benchmark function `f` for `trials` iterations and print statistics
/// about the run.
///
/// Each call to `f` is timed individually. A summary line prefixed with
/// `description` is printed to stdout and the same statistics are returned.
/// All values are in milliseconds.
///
/// If `trials` is zero, `f` is never called, nothing is printed and a
/// zeroed `BenchStats` is returned.
///
/// The reported `var` value is the mean absolute deviation of the timings
/// from their mean.
pub fn run_bench<F: FnMut(), D: Display>(trials: usize, description: D, mut f: F) -> BenchStats {
    if trials == 0 {
        return BenchStats::default();
    }

    let mut times = Vec::with_capacity(trials);
    for _ in 0..trials {
        let start = Instant::now();

        f();

        let duration_ms = start.elapsed().as_secs_f64() * 1000.0;
        // Statistics are reported as `f32`; the precision loss is irrelevant
        // at millisecond scale.
        times.push(duration_ms as f32);
    }

    // Sort ascending so min/max/median can be read off by position.
    // Equal `f32` values under `total_cmp` are bit-identical, so an unstable
    // sort gives the same result as a stable one.
    times.sort_unstable_by(|a, b| a.total_cmp(b));

    // `trials > 0` was checked above, so `times` is non-empty.
    let min = times[0];
    let max = times[times.len() - 1];

    let mid = times.len() / 2;
    let median = if times.len() % 2 == 1 {
        times[mid]
    } else {
        // For an even count the two middle elements are at `mid - 1` and
        // `mid`. (Using `mid + 1` picks the wrong pair and indexes out of
        // bounds when there are exactly two samples.)
        (times[mid - 1] + times[mid]) / 2.
    };
    let mean = times.iter().sum::<f32>() / times.len() as f32;
    // Mean absolute deviation from the mean.
    let var = times.iter().map(|x| (x - mean).abs()).sum::<f32>() / times.len() as f32;

    println!(
        "{}. mean {:.3}ms median {:.3} var {:.3} min {:.3} max {:.3}",
        description, mean, median, var, min, max
    );

    BenchStats {
        max,
        mean,
        median,
        min,
        var,
    }
}
3 changes: 3 additions & 0 deletions rten-imageproc/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,8 @@ repository = "https://github.com/robertknight/rten"
[dependencies]
rten-tensor = { path = "../rten-tensor", version = "0.4.0" }

[dev-dependencies]
rten-bench = { path = "../rten-bench" }

[lib]
crate-type = ["lib"]
9 changes: 2 additions & 7 deletions rten-imageproc/src/contours.rs
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ mod tests {
#[test]
#[ignore]
fn bench_find_contours() {
use rten_tensor::test_util::bench_loop;
use rten_bench::run_bench;

// Fill a mask with a grid of rectangular objects.
let mask_h = 1024;
Expand All @@ -465,14 +465,9 @@ mod tests {
}

let n_iters = 100;
let stats = bench_loop(n_iters, || {
run_bench(n_iters, "find_contours", || {
let contours = find_contours(mask.view(), RetrievalMode::External);
assert_eq!(contours.len(), (grid_rows * grid_cols) as usize);
});
println!(
"find_contours {:.3} ms, {:.3} ns/elem",
stats.duration_ms(),
stats.duration_ns() / (mask_h * mask_w * n_iters) as f64
);
}
}
39 changes: 0 additions & 39 deletions rten-tensor/src/test_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,45 +191,6 @@ pub fn eq_with_nans(a: TensorView, b: TensorView) -> bool {
}
}

/// Timing result of a benchmark loop.
pub struct BenchStats {
    /// Elapsed time, stored in seconds.
    duration: f64,
}

// Multipliers for converting a value in seconds to smaller units.
const SECS_TO_MS: f64 = 1_000.0;
const SECS_TO_US: f64 = 1_000_000.0;
const SECS_TO_NS: f64 = 1_000_000_000.0;

impl BenchStats {
    /// Scale the stored duration (in seconds) by `units_per_sec`.
    fn scaled(&self, units_per_sec: f64) -> f64 {
        self.duration * units_per_sec
    }

    /// Total duration expressed in milliseconds.
    pub fn duration_ms(&self) -> f64 {
        self.scaled(SECS_TO_MS)
    }

    /// Total duration expressed in microseconds.
    pub fn duration_us(&self) -> f64 {
        self.scaled(SECS_TO_US)
    }

    /// Total duration expressed in nanoseconds.
    pub fn duration_ns(&self) -> f64 {
        self.scaled(SECS_TO_NS)
    }
}

/// Minimal benchmark helper: invokes `f` exactly `n_iters` times and reports
/// the total wall-clock time taken by the whole loop.
pub fn bench_loop<F>(n_iters: usize, mut f: F) -> BenchStats
where
    F: FnMut(),
{
    let timer = std::time::Instant::now();
    (0..n_iters).for_each(|_| f());
    let elapsed_secs = timer.elapsed().as_secs_f64();

    BenchStats {
        duration: elapsed_secs,
    }
}

#[cfg(test)]
mod tests {
use super::ApproxEq;
Expand Down
2 changes: 1 addition & 1 deletion src/gemm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -910,6 +910,7 @@ mod tests {
use std::error::Error;
use std::ops::Range;

use rten_bench::run_bench;
use rten_tensor::prelude::*;
use rten_tensor::rng::XorShiftRng;
use rten_tensor::test_util::expect_equal;
Expand All @@ -919,7 +920,6 @@ mod tests {
add_scaled_vector, gemm, round_up, GemmExecutor, GemmInputA, GemmInputB, KernelHint,
VirtualMatrix,
};
use crate::test_util::run_bench;

fn reference_matmul(a: &Tensor, b: &Tensor) -> Tensor {
let [a_rows, _a_cols]: [usize; 2] = a.shape().try_into().expect("input should be a matrix");
Expand Down
3 changes: 0 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,3 @@ mod schema_generated;
// create an abstraction around model execution instead.
#[doc(hidden)]
pub mod model_builder;

#[cfg(test)]
mod test_util;
9 changes: 4 additions & 5 deletions src/ops/conv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1313,10 +1313,11 @@ mod tests {
#[test]
#[ignore]
fn bench_col2im() {
use super::col2im;
use rten_tensor::test_util::bench_loop;
use rten_bench::run_bench;
use rten_tensor::NdTensor;

use super::col2im;

let out_chans = 32;
let in_height = 64;
let in_width = 64;
Expand All @@ -1333,14 +1334,12 @@ mod tests {
&mut rng,
);

let stats = bench_loop(100, || {
run_bench(100, "col2im", || {
col2im(
&mut output.view_mut(),
&columns.view(),
[stride_y, stride_x],
);
});

println!("col2im duration {:3} ms", stats.duration_ms());
}
}
77 changes: 77 additions & 0 deletions src/ops/layout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,7 @@ impl Operator for Unsqueeze {
mod tests {
use std::error::Error;

use rten_bench::run_bench;
use rten_tensor::prelude::*;
use rten_tensor::rng::XorShiftRng;
use rten_tensor::test_util::expect_equal;
Expand Down Expand Up @@ -910,4 +911,80 @@ mod tests {
Some(OpError::InvalidValue("Axes must be unique"))
);
}

#[test]
#[ignore]
fn bench_transpose() {
    /// One benchmark configuration: an input shape and the permutation
    /// applied to it.
    struct Case<'a> {
        /// Shape of the input tensor
        shape: &'a [usize],

        /// Axis order passed to `transpose` (eg. `[1, 0]` for a matrix transpose)
        perm: &'a [usize],
    }

    // Number of timed runs per measurement.
    let trials = 100;

    let mut rng = XorShiftRng::new(1234);

    let cases = [
        // Identity permutation (no-op transpose)
        Case {
            shape: &[512, 512],
            perm: &[0, 1],
        },
        // Plain matrix transposes at several sizes
        Case {
            shape: &[256, 256],
            perm: &[1, 0],
        },
        Case {
            shape: &[512, 512],
            perm: &[1, 0],
        },
        Case {
            shape: &[1024, 1024],
            perm: &[1, 0],
        },
        // Shapes/permutations taken from the Whisper encoder (base model)
        // with 4 batches of samples
        Case {
            shape: &[4, 1500, 8, 64],
            perm: &[0, 2, 1, 3],
        },
        Case {
            shape: &[4, 8, 1500, 64],
            perm: &[0, 2, 1, 3],
        },
        // Shapes/permutations taken from the Whisper decoder (base model)
        Case {
            shape: &[1, 1500, 8, 64],
            perm: &[0, 2, 3, 1],
        },
        Case {
            shape: &[1, 288, 8, 64],
            perm: &[0, 2, 1, 3],
        },
    ];

    for case in &cases {
        let input = Tensor::rand(case.shape, &mut rng);

        // Baseline: a plain copy of the tensor. Transpose cannot be expected
        // to run faster than this.
        let copy_label = format!("copy {:?}", case.shape);
        let copy_stats = run_bench(trials, copy_label, || {
            input.view().to_tensor();
        });

        let transpose_label = format!("transpose {:?} perm {:?}", case.shape, case.perm);
        let transpose_stats = run_bench(trials, transpose_label, || {
            transpose(input.view(), Some(case.perm)).unwrap();
        });

        // Extra time spent relative to the copy baseline, clamped at zero and
        // expressed as a fraction of the baseline's mean.
        let extra_ms = (transpose_stats.mean - copy_stats.mean).max(0.);
        let transpose_overhead = extra_ms / copy_stats.mean;
        println!("transpose {:?} overhead {}", case.shape, transpose_overhead);
    }
}
}
2 changes: 1 addition & 1 deletion src/ops/matmul.rs
Original file line number Diff line number Diff line change
Expand Up @@ -226,13 +226,13 @@ impl Operator for MatMul {
mod tests {
use std::error::Error;

use rten_bench::run_bench;
use rten_tensor::prelude::*;
use rten_tensor::rng::XorShiftRng;
use rten_tensor::test_util::expect_equal;
use rten_tensor::{Tensor, TensorView, TensorViewMut};

use crate::gemm::gemm;
use crate::test_util::run_bench;

use super::{gemm_op, matmul, matmul_impl, MatmulStrategy, OpError};

Expand Down
Loading

0 comments on commit ff8d22d

Please sign in to comment.