From bf73c6f595cd291e2d38389f3461ff2b8fa28538 Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Fri, 15 Nov 2024 21:52:50 +0000 Subject: [PATCH] Optimize copying of non-contiguous tensors with 5+ dimensions Improve code path for tensors with 5+ dimensions in `TensorBase::init_from`. Instead of falling back to slow iteration via `TensorBase::iter`, iterate over inner views of 4 dims and use the faster code path that handles this. --- rten-tensor/src/copy.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/rten-tensor/src/copy.rs b/rten-tensor/src/copy.rs index 54e1a82b..88321119 100644 --- a/rten-tensor/src/copy.rs +++ b/rten-tensor/src/copy.rs @@ -182,9 +182,14 @@ pub fn copy_into_slice<'a, T: Clone>( src.merge_axes(); if src.ndim() > 4 { - for (dst, src) in dest.iter_mut().zip(src.iter()) { - dst.write(src.clone()); + let chunk_size = src.shape()[src.ndim() - 4..].iter().product(); + let mut n_init = 0; + for (src, dest) in src.inner_iter::<4>().zip(dest.chunks_mut(chunk_size)) { + copy_into_slice(src.as_dyn(), dest); + n_init += chunk_size; } + assert!(n_init == dest.len()); + // Safety: Loop above initialized all elements of `dest`. return unsafe { transmute::<&mut [MaybeUninit<T>], &[T]>(dest) }; }