Commit 766c845: PR fixes

vilukissa68 committed Dec 16, 2024
1 parent 82ad92d · commit 766c845
Showing 4 changed files with 95 additions and 10 deletions.
8 changes: 3 additions & 5 deletions examples/hpc/dla-driver-ffi/src/lib.rs
@@ -13,6 +13,7 @@ use dla_driver::tensor3::{Order3, Tensor3};
use dla_driver::tensor4::{Order4, Tensor4};
use dla_driver::utils::optimal_pp_bias_heuristic;
use dla_driver::{Padding, Stride};
use headsail_bsp::init_heap;

/// Converts C types to DLA Tensors for use with the high-level layer
#[allow(clippy::too_many_arguments)]
@@ -72,7 +73,8 @@ unsafe fn ffi_data_import(
/// Initializes the DLA by setting up the necessary heap allocator from headsail-bsp. This should be called only once in the program.
#[no_mangle]
pub unsafe extern "C" fn dla_init() {
headsail_bsp::init_heap();
// SAFETY: `init_heap` must be called once only
unsafe { init_heap() };
}
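Because `init_heap` must not run more than once, a call-site guard can make the FFI entry point idempotent. The following is only a sketch: the `dla_init_once` wrapper and the atomic flag are not part of this commit, and it assumes the caller cannot otherwise guarantee a single call.

```rust
use core::sync::atomic::{AtomicBool, Ordering};
use headsail_bsp::init_heap;

// Hypothetical guard, not present in the driver: makes repeated calls harmless.
static HEAP_INITIALIZED: AtomicBool = AtomicBool::new(false);

#[no_mangle]
pub unsafe extern "C" fn dla_init_once() {
    // `swap` returns the previous value, so only the first caller sees `false`.
    if !HEAP_INITIALIZED.swap(true, Ordering::SeqCst) {
        // SAFETY: this branch is reached at most once for the program's lifetime.
        unsafe { init_heap() };
    }
}
```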

/// Executes Conv2D on the DLA with the given parameters and writes the result to the output buffer.
@@ -444,8 +446,6 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_bias(
None,
);

let _input_order_string = unsafe { CStr::from_ptr(input_order).to_str().unwrap_unchecked() };

// TVM requantization and clip
// NOTE:(20240927 [email protected]) on DLA clipping behaviour with TVM.
// DLA's conv2d arithmetic is done at 16 bit width, but the output of the DLA is limited to 8 bits.
@@ -540,8 +540,6 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_grouped_bias(
groups,
);

let _input_order_string = unsafe { CStr::from_ptr(input_order).to_str().unwrap_unchecked() };

// TVM requantization and clip
// NOTE:(20240927 [email protected]) on DLA clipping behaviour with TVM.
// DLA's conv2d arithmetic is done at 16 bit width, but the output of the DLA is limited to 8 bits.
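The clipping note above (16-bit conv2d arithmetic narrowed to an 8-bit output) can be pictured with a small standalone sketch. This assumes clipping means an arithmetic right shift followed by saturation into the `i8` range; the DLA's actual post-processing pipeline is not reproduced here.

```rust
/// Rough illustration only: narrow a 16-bit accumulator to 8 bits by
/// shifting right by `clip_amount` bits and saturating to the i8 range.
fn clip_to_i8(acc: i16, clip_amount: u32) -> i8 {
    let shifted = acc >> clip_amount;
    shifted.clamp(i8::MIN as i16, i8::MAX as i16) as i8
}

// Example: an accumulator of 1000 with clip_amount = 2 shifts to 250,
// which still exceeds i8::MAX and therefore saturates to 127.
```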
4 changes: 4 additions & 0 deletions examples/hpc/dla-driver/examples/depthwise.rs
@@ -14,6 +14,8 @@ use alloc::vec::Vec;

fn conv_test() {
sprintln!("conv_test: enter");

#[rustfmt::skip]
let din: Vec<i8> = vec![
1, 2, 3, 4, 5,
1, 2, 3, 4, 5,
@@ -39,6 +41,8 @@
2, 4, 5, 1, 3,
2, 4, 5, 1, 3,
];

#[rustfmt::skip]
let wgt: Vec<i8> = vec![
1,2,3,
4,5,6,
89 changes: 86 additions & 3 deletions examples/hpc/dla-driver/src/layers.rs
@@ -51,6 +51,20 @@ pub fn dense(outputs: usize, input: Tensor3<i8>, weights: Vec<i8>) -> Vec<i32> {
output.to_buffer()
}

/// Performs a 2D convolution operation with DLA.
///
/// # Arguments
/// - `input`: A 3-dimensional tensor of 8-bit signed integers (`Tensor3<i8>`) representing the input feature map.
/// - `kernels`: A 4-dimensional tensor of 8-bit signed integers (`Tensor4<i8>`) representing the convolution kernels.
/// - `padding`: An optional `Padding` parameter defining the padding strategy applied to the input.
/// - `stride`: An optional `Stride` parameter defining the stride of the convolution in X and Y directions.
/// - `mac_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after Conv2D operations.
/// - `pp_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after the post-processing pipeline.
/// - `simd_mode`: An optional `SimdBitMode` to control which SIMD instruction is used.
///
/// # Returns
/// - A 3-dimensional tensor of type `T` representing the output of the convolution operation.
pub fn conv2d<T: DlaOutput + Clone>(
input: Tensor3<i8>,
kernels: Tensor4<i8>,
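For orientation (the rest of the signature is elided by the diff), here is a hedged usage sketch of `conv2d` as documented above. The parameter order follows the doc comment; the module paths, the `i8` output type, and the clip values are assumptions for illustration rather than verified against this crate.

```rust
use dla_driver::layers::conv2d; // assumed module path
use dla_driver::tensor3::Tensor3;
use dla_driver::tensor4::Tensor4;
use dla_driver::{Padding, SimdBitMode, Stride};

// Sketch: `input` and `kernels` are built elsewhere with the tensor3/tensor4 APIs.
fn conv2d_example(
    input: Tensor3<i8>,
    kernels: Tensor4<i8>,
    padding: Padding,
    stride: Stride,
) -> Tensor3<i8> {
    conv2d::<i8>(
        input,
        kernels,
        Some(padding),
        Some(stride),
        Some(8),                      // mac_clip: clipping after the Conv2D MACs
        Some(4),                      // pp_clip: clipping after the post-processing pipeline
        Some(SimdBitMode::EightBits), // SIMD mode, mirroring the call in `bias` above
    )
}
```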
@@ -116,7 +130,20 @@ pub fn bias(input: Tensor3<i8>, bias: Vec<i16>, pp_clip: Option<u32>) -> Tensor3
Some(SimdBitMode::EightBits),
)
}

/// Performs a 2D convolution + ReLU operation with DLA.
///
/// # Arguments
/// - `input`: A 3-dimensional tensor of 8-bit signed integers (`Tensor3<i8>`) representing the input feature map.
/// - `kernels`: A 4-dimensional tensor of 8-bit signed integers (`Tensor4<i8>`) representing the convolution kernels.
/// - `padding`: An optional `Padding` parameter defining the padding strategy applied to the input.
/// - `stride`: An optional `Stride` parameter defining the stride of the convolution in X and Y directions.
/// - `mac_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after Conv2D operations.
/// - `pp_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after the post-processing pipeline.
/// - `simd_mode`: An optional `SimdBitMode` to control which SIMD instruction is used.
///
/// # Returns
/// - A 3-dimensional tensor of type `T` representing the output of the convolution operation.
pub fn conv2d_relu<T: DlaOutput + Clone>(
input: Tensor3<i8>,
kernels: Tensor4<i8>,
@@ -131,6 +158,21 @@ pub fn conv2d_relu<T: DlaOutput + Clone>(
)
}

/// Performs a 2D convolution + Bias operation with DLA.
///
/// # Arguments
/// - `input`: A 3-dimensional tensor of 8-bit signed integers (`Tensor3<i8>`) representing the input feature map.
/// - `kernels`: A 4-dimensional tensor of 8-bit signed integers (`Tensor4<i8>`) representing the convolution kernels.
/// - `bias`: A vector of 16-bit signed integers containing biases for each channel.
/// - `padding`: An optional `Padding` parameter defining the padding strategy applied to the input.
/// - `stride`: An optional `Stride` parameter defining the stride of the convolution in X and Y directions.
/// - `mac_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after Conv2D operations.
/// - `pp_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after the post-processing pipeline.
/// - `simd_mode`: An optional `SimdBitMode` to control which SIMD instruction is used.
///
/// # Returns
/// - A 3-dimensional tensor of type `T` representing the output of the convolution operation.
pub fn conv2d_bias<T: DlaOutput + Clone>(
input: Tensor3<i8>,
kernels: Tensor4<i8>,
@@ -154,7 +196,21 @@ pub fn conv2d_bias<T: DlaOutput + Clone>(
simd_mode,
)
}
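The bias variant is called the same way; the only new input is the `bias` vector of 16-bit values, one per channel as described in the doc comment, assumed here to mean one per output channel (i.e. one per kernel). A hedged fragment mirroring the sketch above, with illustrative values only.

```rust
// Hedged sketch, mirroring conv2d_example above but with a per-channel bias.
fn conv2d_bias_example(
    input: Tensor3<i8>,
    kernels: Tensor4<i8>,
    padding: Padding,
    stride: Stride,
) -> Tensor3<i8> {
    let out_channels = 16; // illustrative: must match the kernel count
    let bias: Vec<i16> = vec![0; out_channels];
    conv2d_bias::<i8>(
        input,
        kernels,
        bias,
        Some(padding),
        Some(stride),
        Some(8),                      // mac_clip
        Some(4),                      // pp_clip
        Some(SimdBitMode::EightBits),
    )
}
```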

/// Performs a 2D convolution + Bias + ReLU operation with DLA.
///
/// # Arguments
/// - `input`: A 3-dimensional tensor of 8-bit signed integers (`Tensor3<i8>`) representing the input feature map.
/// - `kernels`: A 4-dimensional tensor of 8-bit signed integers (`Tensor4<i8>`) representing the convolution kernels.
/// - `bias`: A vector of 16-bit signed integers containing biases for each channel.
/// - `padding`: An optional `Padding` parameter defining the padding strategy applied to the input.
/// - `stride`: An optional `Stride` parameter defining the stride of the convolution in X and Y directions.
/// - `mac_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after Conv2D operations.
/// - `pp_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after the post-processing pipeline.
/// - `simd_mode`: An optional `SimdBitMode` to control which SIMD instruction is used.
///
/// # Returns
/// - A 3-dimensional tensor of type `T` representing the output of the convolution operation.
pub fn conv2d_bias_relu<T: DlaOutput + Clone>(
input: Tensor3<i8>,
kernels: Tensor4<i8>,
@@ -179,6 +235,33 @@ pub fn conv2d_bias_relu<T: DlaOutput + Clone>(
)
}

/// Performs a 2D grouped convolution + Bias operation with DLA.
///
/// # Arguments
/// - `input`: A 3-dimensional tensor of 8-bit signed integers (`Tensor3<i8>`) representing the input feature map.
/// - `kernels`: A 4-dimensional tensor of 8-bit signed integers (`Tensor4<i8>`) representing the convolution kernels.
/// - `bias`: A vector of 16-bit signed integers containing biases for each channel.
/// - `padding`: An optional `Padding` parameter defining the padding strategy applied to the input.
/// - `stride`: An optional `Stride` parameter defining the stride of the convolution in X and Y directions.
/// - `mac_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after Conv2D operations.
/// - `pp_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after the post-processing pipeline.
/// - `simd_mode`: An optional `SimdBitMode` to control which SIMD instruction is used.
/// - `groups`: Number of groups used.
///
/// # Returns
/// - A 3-dimensional tensor of type `T` representing the output of the convolution operation.
///
/// # Notes
/// - The total number of input channels must be divisible by `groups`.
/// - The total number of kernels must also be divisible by `groups`.
/// - Padding and stride configurations are applied consistently across all groups.
///
/// # Example
/// For an input tensor with 8 channels, kernels with 16 filters, and `groups = 2`:
/// - The input channels are split into 2 groups of 4 channels each.
/// - Each group processes its portion with 8 filters (16 filters / 2 groups).
/// - The final output will have 16 channels (8 channels per group concatenated).
pub fn grouped_conv2d<T: DlaOutput + Clone>(
input: Tensor3<i8>,
kernels: Tensor4<i8>,
@@ -220,7 +303,7 @@ pub fn grouped_conv2d<T: DlaOutput + Clone>(
}

// Concatenate the output tensors along the channel dimension
Tensor3::concat_interleaved(output_tensors)
Tensor3::concat_interleaved(&output_tensors)
}
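The channel bookkeeping described in the grouped convolution example above can be checked with a small standalone sketch (plain Rust, no DLA calls); the numbers below mirror the 8-channel, 16-filter, `groups = 2` case from the doc comment.

```rust
/// Standalone sanity check of the grouped-conv channel arithmetic.
fn grouped_shapes(in_channels: usize, num_kernels: usize, groups: usize) -> (usize, usize, usize) {
    assert_eq!(in_channels % groups, 0, "input channels must be divisible by groups");
    assert_eq!(num_kernels % groups, 0, "kernel count must be divisible by groups");
    let channels_per_group = in_channels / groups; // channels each group consumes
    let filters_per_group = num_kernels / groups;  // filters each group applies
    let out_channels = filters_per_group * groups; // equals num_kernels after concatenation
    (channels_per_group, filters_per_group, out_channels)
}

// grouped_shapes(8, 16, 2) == (4, 8, 16), matching the example in the doc comment.
```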

fn run_layers<T: DlaOutput + Clone>(
4 changes: 2 additions & 2 deletions examples/hpc/dla-driver/src/tensor3.rs
@@ -184,7 +184,7 @@ impl<T: Clone> Tensor3<T> {
}

/// Concatenates tensors along the least significant axis (axis=2) by interleaving them
pub fn concat_interleaved(tensors: Vec<Tensor3<T>>) -> Tensor3<T> {
pub fn concat_interleaved(tensors: &[Tensor3<T>]) -> Tensor3<T> {
let _target_order = tensors[0].order();
let (height, width, channels) = (
tensors[0].height(),
@@ -196,7 +196,7 @@
for h in 0..height {
for w in 0..width {
for c in 0..channels {
for tensor in &tensors {
for tensor in tensors {
intermediary_buffer.push(tensor.data[(h, w, c)].clone());
}
}
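To make the interleaving order of `concat_interleaved` concrete, here is a minimal sketch of the same inner loops over plain vectors (two single-pixel tensors with two channels each; the `Tensor3` indexing API is not used).

```rust
// Mirrors the innermost loops of concat_interleaved for one (h, w) position:
// for each channel index, one element is taken from every tensor in turn.
let a = vec![1, 2];   // tensor A channels: a0, a1
let b = vec![10, 20]; // tensor B channels: b0, b1
let mut out = Vec::new();
for c in 0..2 {
    for t in [&a, &b] {
        out.push(t[c]);
    }
}
assert_eq!(out, vec![1, 10, 2, 20]); // a0, b0, a1, b1: channels interleaved across tensors
```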
