Skip to content

Commit

Permalink
Correct implementation
Browse files: browse the repository at this point in the history
  • Loading branch information
Aditya ujeniya committed Sep 16, 2024
1 parent 32823b1 commit fd014fe
Show file tree
Hide file tree
Showing 9 changed files with 64 additions and 59 deletions.
10 changes: 4 additions & 6 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
# For generic x86_64 architecture.
[x86-64-unknown-linux-gnu]
[target.x86-64-unknown-linux-gnu]
rustflags = [
"-C",
"target-cpu=x86-64-v4",
"-C",
"target-feature=+fma,+avx2,+avx",
"-C",
"llvm-args=-ffast-math",
"target-cpu=native",
"-C",
"opt-level=3",
"-C",
"llvm-args=\"--pass-remarks=.*vector.* --pass-remarks-analysis=.*vector.*\"",
]

# Use +avx512f (plus any other avx512* features needed, e.g. +avx512vl) to enable AVX-512.
Expand Down
23 changes: 0 additions & 23 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ edition = "2021"

[dependencies]
clap = { version = "4.5.13", features = ["derive"] }
num_cpus = "1.16.0"
rayon = "1.10.0"

[[bin]]
Expand All @@ -14,5 +13,6 @@ path = "src/main.rs"

[profile.release]
# debug = true
# lto = true
lto = "fat"
codegen-units = 1
incremental = false
Binary file modified bench
Binary file not shown.
10 changes: 7 additions & 3 deletions src/kernels/copy.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use std::time::Instant;

use rayon::{iter::ParallelIterator, slice::ParallelSliceMut};
use rayon::{
iter::{IndexedParallelIterator, ParallelIterator},
slice::{ParallelSlice, ParallelSliceMut},
};

#[allow(clippy::ptr_arg, clippy::manual_memcpy, unused_variables)]
#[inline(never)]
Expand All @@ -9,6 +12,7 @@ pub fn copy(c: &mut [f64], a: &[f64], n: usize, block_size: usize) -> f64 {
let a = &a[..n];

let c_iter = c.par_chunks_mut(block_size);
let a_iter = a.par_chunks(block_size);

let s = Instant::now();

Expand All @@ -18,11 +22,11 @@ pub fn copy(c: &mut [f64], a: &[f64], n: usize, block_size: usize) -> f64 {
// }

// Parallel version
c_iter.for_each(|c_slice| {
c_iter.zip(a_iter).for_each(|(c_slice, a_slice)| {
c_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val = a[i])
.for_each(|(i, val)| *val = a_slice[i])
});

s.elapsed().as_secs_f64()
Expand Down
12 changes: 8 additions & 4 deletions src/kernels/daxpy.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
use std::time::Instant;

use rayon::{iter::ParallelIterator, slice::ParallelSliceMut};
use rayon::{
iter::{IndexedParallelIterator, ParallelIterator},
slice::{ParallelSlice, ParallelSliceMut},
};

#[allow(clippy::ptr_arg, unused_variables)]
#[inline(never)]
pub fn daxpy(a: &mut [f64], b: &[f64], scalar: f64, n: usize, block_size: usize) -> f64 {
let a = &mut a[..n];
let b = &b[..n];

let a_iter = a.par_chunks_mut(block_size);
let b_iter = b.par_chunks(block_size);

let s = Instant::now();

Expand All @@ -18,11 +22,11 @@ pub fn daxpy(a: &mut [f64], b: &[f64], scalar: f64, n: usize, block_size: usize)
// }

// Parallel version
a_iter.for_each(|a_slice| {
a_iter.zip(b_iter).for_each(|(a_slice, b_slice)| {
a_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val = b[i].mul_add(scalar, *val))
.for_each(|(i, val)| *val = b_slice[i].mul_add(scalar, *val))
});

s.elapsed().as_secs_f64()
Expand Down
21 changes: 14 additions & 7 deletions src/kernels/sdaxpy.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use std::time::Instant;

use rayon::{iter::ParallelIterator, slice::ParallelSliceMut};
use rayon::{
iter::{IndexedParallelIterator, ParallelIterator},
slice::{ParallelSlice, ParallelSliceMut},
};

#[allow(clippy::ptr_arg, unused_variables)]
pub fn sdaxpy(a: &mut [f64], b: &[f64], c: &[f64], n: usize, block_size: usize) -> f64 {
Expand All @@ -9,6 +12,8 @@ pub fn sdaxpy(a: &mut [f64], b: &[f64], c: &[f64], n: usize, block_size: usize)
let c = &c[..n];

let a_iter = a.par_chunks_mut(block_size);
let b_iter = b.par_chunks(block_size);
let c_iter = c.par_chunks(block_size);

let s = Instant::now();

Expand All @@ -18,12 +23,14 @@ pub fn sdaxpy(a: &mut [f64], b: &[f64], c: &[f64], n: usize, block_size: usize)
// }

// Parallel version
a_iter.for_each(|a_slice| {
a_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val += c[i] * b[i])
});
a_iter
.zip((b_iter, c_iter))
.for_each(|(a_slice, (b_slice, c_slice))| {
a_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val += c_slice[i] * b_slice[i])
});

s.elapsed().as_secs_f64()
}
22 changes: 15 additions & 7 deletions src/kernels/striad.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use std::time::Instant;

use rayon::{iter::ParallelIterator, slice::ParallelSliceMut};
use rayon::{
iter::{IndexedParallelIterator, ParallelIterator},
slice::{ParallelSlice, ParallelSliceMut},
};

#[allow(clippy::ptr_arg, unused_variables)]
pub fn striad(a: &mut [f64], b: &[f64], c: &[f64], d: &[f64], n: usize, block_size: usize) -> f64 {
Expand All @@ -10,6 +13,9 @@ pub fn striad(a: &mut [f64], b: &[f64], c: &[f64], d: &[f64], n: usize, block_si
let d = &d[..n];

let a_iter = a.par_chunks_mut(block_size);
let b_iter = b.par_chunks(block_size);
let c_iter = c.par_chunks(block_size);
let d_iter = d.par_chunks(block_size);

let s = Instant::now();

Expand All @@ -19,11 +25,13 @@ pub fn striad(a: &mut [f64], b: &[f64], c: &[f64], d: &[f64], n: usize, block_si
// }

// Parallel version
a_iter.for_each(|a_slice| {
a_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val = c[i] * d[i] + b[i])
});
a_iter
.zip((b_iter, c_iter, d_iter))
.for_each(|(a_slice, (b_slice, c_slice, d_slice))| {
a_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val = c_slice[i] * d_slice[i] + b_slice[i])
});
s.elapsed().as_secs_f64()
}
21 changes: 14 additions & 7 deletions src/kernels/triad.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use std::time::Instant;

use rayon::{iter::ParallelIterator, slice::ParallelSliceMut};
use rayon::{
iter::{IndexedParallelIterator, ParallelIterator},
slice::{ParallelSlice, ParallelSliceMut},
};

#[allow(clippy::ptr_arg, unused_variables)]
#[inline(never)]
Expand All @@ -10,6 +13,8 @@ pub fn triad(a: &mut [f64], b: &[f64], c: &[f64], scalar: f64, n: usize, block_s
let c = &c[..n];

let a_iter = a.par_chunks_mut(block_size);
let b_iter = b.par_chunks(block_size);
let c_iter = c.par_chunks(block_size);

let s = Instant::now();

Expand All @@ -19,12 +24,14 @@ pub fn triad(a: &mut [f64], b: &[f64], c: &[f64], scalar: f64, n: usize, block_s
// }

// Parallel version
a_iter.for_each(|a_slice| {
a_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val = c[i] * scalar + b[i])
});
a_iter
.zip((b_iter, c_iter))
.for_each(|(a_slice, (b_slice, c_slice))| {
a_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val = c_slice[i] * scalar + b_slice[i])
});

s.elapsed().as_secs_f64()
}

0 comments on commit fd014fe

Please sign in to comment.