Commit 766c845: PR fixes

vilukissa68 committed Dec 16, 2024
1 parent 82ad92d · commit 766c845
Showing 4 changed files with 95 additions and 10 deletions.
8 changes: 3 additions & 5 deletions examples/hpc/dla-driver-ffi/src/lib.rs
@@ -13,6 +13,7 @@ use dla_driver::tensor3::{Order3, Tensor3};
use dla_driver::tensor4::{Order4, Tensor4};
use dla_driver::utils::optimal_pp_bias_heuristic;
use dla_driver::{Padding, Stride};
use headsail_bsp::init_heap;

/// Converts C types to DLA Tensors for use with the high-level layer
#[allow(clippy::too_many_arguments)]
@@ -72,7 +73,8 @@ unsafe fn ffi_data_import(
/// Initializes the DLA by setting up the necessary heap allocator from headsail-bsp. This should be called only once in the program.
#[no_mangle]
pub unsafe extern "C" fn dla_init() {
headsail_bsp::init_heap();
// SAFETY: `init_heap` must be called once only
unsafe { init_heap() };
}
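Because `init_heap` must not run more than once, a call-site guard can make the FFI entry point idempotent. The following is only a sketch: the `dla_init_once` wrapper and the atomic flag are not part of this commit, and it assumes the caller cannot otherwise guarantee a single call.

```rust
use core::sync::atomic::{AtomicBool, Ordering};
use headsail_bsp::init_heap;

// Hypothetical guard, not present in the driver: makes repeated calls harmless.
static HEAP_INITIALIZED: AtomicBool = AtomicBool::new(false);

#[no_mangle]
pub unsafe extern "C" fn dla_init_once() {
    // `swap` returns the previous value, so only the first caller sees `false`.
    if !HEAP_INITIALIZED.swap(true, Ordering::SeqCst) {
        // SAFETY: this branch is reached at most once for the program's lifetime.
        unsafe { init_heap() };
    }
}
```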

/// Executes Conv2D on the DLA with the given parameters and writes the result to the output buffer.
@@ -444,8 +446,6 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_bias(
None,
);

let _input_order_string = unsafe { CStr::from_ptr(input_order).to_str().unwrap_unchecked() };

// TVM requantization and clip
// NOTE:(20240927 [email protected]) on DLA clipping behaviour with TVM.
// DLA's conv2d arithmetic is done at 16 bit width, but the output of the DLA is limited to 8 bits.
@@ -540,8 +540,6 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_grouped_bias(
groups,
);

let _input_order_string = unsafe { CStr::from_ptr(input_order).to_str().unwrap_unchecked() };

// TVM requantization and clip
// NOTE:(20240927 [email protected]) on DLA clipping behaviour with TVM.
// DLA's conv2d arithmetic is done at 16 bit width, but the output of the DLA is limited to 8 bits.
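The clipping note above (16-bit conv2d arithmetic narrowed to an 8-bit output) can be pictured with a small standalone sketch. This assumes clipping means an arithmetic right shift followed by saturation into the `i8` range; the DLA's actual post-processing pipeline is not reproduced here.

```rust
/// Rough illustration only: narrow a 16-bit accumulator to 8 bits by
/// shifting right by `clip_amount` bits and saturating to the i8 range.
fn clip_to_i8(acc: i16, clip_amount: u32) -> i8 {
    let shifted = acc >> clip_amount;
    shifted.clamp(i8::MIN as i16, i8::MAX as i16) as i8
}

// Example: an accumulator of 1000 with clip_amount = 2 shifts to 250,
// which still exceeds i8::MAX and therefore saturates to 127.
```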
4 changes: 4 additions & 0 deletions examples/hpc/dla-driver/examples/depthwise.rs
@@ -14,6 +14,8 @@ use alloc::vec::Vec;

fn conv_test() {
sprintln!("conv_test: enter");

#[rustfmt::skip]
let din: Vec<i8> = vec![
1, 2, 3, 4, 5,
1, 2, 3, 4, 5,
@@ -39,6 +41,8 @@
2, 4, 5, 1, 3,
2, 4, 5, 1, 3,
];

#[rustfmt::skip]
let wgt: Vec<i8> = vec![
1,2,3,
4,5,6,
89 changes: 86 additions & 3 deletions examples/hpc/dla-driver/src/layers.rs
@@ -51,6 +51,20 @@ pub fn dense(outputs: usize, input: Tensor3<i8>, weights: Vec<i8>) -> Vec<i32> {
output.to_buffer()
}

/// Performs a 2D convolution operation with DLA.
///
/// # Arguments
/// - `input`: A 3-dimensional tensor of 8-bit signed integers (`Tensor3<i8>`) representing the input feature map.
/// - `kernels`: A 4-dimensional tensor of 8-bit signed integers (`Tensor4<i8>`) representing the convolution kernels.
/// - `padding`: An optional `Padding` parameter defining the padding strategy applied to the input.
/// - `stride`: An optional `Stride` parameter defining the stride of the convolution in X and Y directions.
/// - `mac_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after Conv2D operations.
/// - `pp_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after the post-processing pipeline.
/// - `simd_mode`: An optional `SimdBitMode` to control which SIMD instruction is used.
///
/// # Returns
/// - A 3-dimensional tensor of type `T` representing the output of the convolution operation.
pub fn conv2d<T: DlaOutput + Clone>(
input: Tensor3<i8>,
kernels: Tensor4<i8>,
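For orientation (the rest of the signature is elided by the diff), here is a hedged usage sketch of `conv2d` as documented above. The parameter order follows the doc comment; the module paths, the `i8` output type, and the clip values are assumptions for illustration rather than verified against this crate.

```rust
use dla_driver::layers::conv2d; // assumed module path
use dla_driver::tensor3::Tensor3;
use dla_driver::tensor4::Tensor4;
use dla_driver::{Padding, SimdBitMode, Stride};

// Sketch: `input` and `kernels` are built elsewhere with the tensor3/tensor4 APIs.
fn conv2d_example(
    input: Tensor3<i8>,
    kernels: Tensor4<i8>,
    padding: Padding,
    stride: Stride,
) -> Tensor3<i8> {
    conv2d::<i8>(
        input,
        kernels,
        Some(padding),
        Some(stride),
        Some(8),                      // mac_clip: clipping after the Conv2D MACs
        Some(4),                      // pp_clip: clipping after the post-processing pipeline
        Some(SimdBitMode::EightBits), // SIMD mode, mirroring the call in `bias` above
    )
}
```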
@@ -116,7 +130,20 @@ pub fn bias(input: Tensor3<i8>, bias: Vec<i16>, pp_clip: Option<u32>) -> Tensor3
Some(SimdBitMode::EightBits),
)
}

/// Performs a 2D convolution + ReLU operation with DLA.
///
/// # Arguments
/// - `input`: A 3-dimensional tensor of 8-bit signed integers (`Tensor3<i8>`) representing the input feature map.
/// - `kernels`: A 4-dimensional tensor of 8-bit signed integers (`Tensor4<i8>`) representing the convolution kernels.
/// - `padding`: An optional `Padding` parameter defining the padding strategy applied to the input.
/// - `stride`: An optional `Stride` parameter defining the stride of the convolution in X and Y directions.
/// - `mac_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after Conv2D operations.
/// - `pp_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after the post-processing pipeline.
/// - `simd_mode`: An optional `SimdBitMode` to control which SIMD instruction is used.
///
/// # Returns
/// - A 3-dimensional tensor of type `T` representing the output of the convolution operation.
pub fn conv2d_relu<T: DlaOutput + Clone>(
input: Tensor3<i8>,
kernels: Tensor4<i8>,
@@ -131,6 +158,21 @@ pub fn conv2d_relu<T: DlaOutput + Clone>(
)
}

/// Performs a 2D convolution + Bias operation with DLA.
///
/// # Arguments
/// - `input`: A 3-dimensional tensor of 8-bit signed integers (`Tensor3<i8>`) representing the input feature map.
/// - `kernels`: A 4-dimensional tensor of 8-bit signed integers (`Tensor4<i8>`) representing the convolution kernels.
/// - `bias`: A vector of 16-bit signed integers containing biases for each channel.
/// - `padding`: An optional `Padding` parameter defining the padding strategy applied to the input.
/// - `stride`: An optional `Stride` parameter defining the stride of the convolution in X and Y directions.
/// - `mac_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after Conv2D operations.
/// - `pp_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after the post-processing pipeline.
/// - `simd_mode`: An optional `SimdBitMode` to control which SIMD instruction is used.
///
/// # Returns
/// - A 3-dimensional tensor of type `T` representing the output of the convolution operation.
pub fn conv2d_bias<T: DlaOutput + Clone>(
input: Tensor3<i8>,
kernels: Tensor4<i8>,
@@ -154,7 +196,21 @@ pub fn conv2d_bias<T: DlaOutput + Clone>(
simd_mode,
)
}
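The bias variant is called the same way; the only new input is the `bias` vector of 16-bit values, one per channel as described in the doc comment, assumed here to mean one per output channel (i.e. one per kernel). A hedged fragment mirroring the sketch above, with illustrative values only.

```rust
// Hedged sketch, mirroring conv2d_example above but with a per-channel bias.
fn conv2d_bias_example(
    input: Tensor3<i8>,
    kernels: Tensor4<i8>,
    padding: Padding,
    stride: Stride,
) -> Tensor3<i8> {
    let out_channels = 16; // illustrative: must match the kernel count
    let bias: Vec<i16> = vec![0; out_channels];
    conv2d_bias::<i8>(
        input,
        kernels,
        bias,
        Some(padding),
        Some(stride),
        Some(8),                      // mac_clip
        Some(4),                      // pp_clip
        Some(SimdBitMode::EightBits),
    )
}
```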

/// Performs a 2D convolution + Bias + ReLU operation with DLA.
///
/// # Arguments
/// - `input`: A 3-dimensional tensor of 8-bit signed integers (`Tensor3<i8>`) representing the input feature map.
/// - `kernels`: A 4-dimensional tensor of 8-bit signed integers (`Tensor4<i8>`) representing the convolution kernels.
/// - `bias`: A vector of 16-bit signed integers containing biases for each channel.
/// - `padding`: An optional `Padding` parameter defining the padding strategy applied to the input.
/// - `stride`: An optional `Stride` parameter defining the stride of the convolution in X and Y directions.
/// - `mac_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after Conv2D operations.
/// - `pp_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after the post-processing pipeline.
/// - `simd_mode`: An optional `SimdBitMode` to control which SIMD instruction is used.
///
/// # Returns
/// - A 3-dimensional tensor of type `T` representing the output of the convolution operation.
pub fn conv2d_bias_relu<T: DlaOutput + Clone>(
input: Tensor3<i8>,
kernels: Tensor4<i8>,
@@ -179,6 +235,33 @@ pub fn conv2d_bias_relu<T: DlaOutput + Clone>(
)
}

/// Performs a 2D grouped convolution + Bias operation with DLA.
///
/// # Arguments
/// - `input`: A 3-dimensional tensor of 8-bit signed integers (`Tensor3<i8>`) representing the input feature map.
/// - `kernels`: A 4-dimensional tensor of 8-bit signed integers (`Tensor4<i8>`) representing the convolution kernels.
/// - `bias`: A vector of 16-bit signed integers containing biases for each channel.
/// - `padding`: An optional `Padding` parameter defining the padding strategy applied to the input.
/// - `stride`: An optional `Stride` parameter defining the stride of the convolution in X and Y directions.
/// - `mac_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after Conv2D operations.
/// - `pp_clip`: An optional 32-bit unsigned integer (`u32`) specifying the amount of clipping after the post-processing pipeline.
/// - `simd_mode`: An optional `SimdBitMode` to control which SIMD instruction is used.
/// - `groups`: Number of groups used.
///
/// # Returns
/// - A 3-dimensional tensor of type `T` representing the output of the convolution operation.
///
/// # Notes
/// - The total number of input channels must be divisible by `groups`.
/// - The total number of kernels must also be divisible by `groups`.
/// - Padding and stride configurations are applied consistently across all groups.
///
/// # Example
/// For an input tensor with 8 channels, kernels with 16 filters, and `groups = 2`:
/// - The input channels are split into 2 groups of 4 channels each.
/// - Each group processes its portion with 8 filters (16 filters / 2 groups).
/// - The final output will have 16 channels (8 channels per group concatenated).
pub fn grouped_conv2d<T: DlaOutput + Clone>(
input: Tensor3<i8>,
kernels: Tensor4<i8>,
@@ -220,7 +303,7 @@ pub fn grouped_conv2d<T: DlaOutput + Clone>(
}

// Concatenate the output tensors along the channel dimension
Tensor3::concat_interleaved(output_tensors)
Tensor3::concat_interleaved(&output_tensors)
}
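The channel bookkeeping described in the grouped convolution example above can be checked with a small standalone sketch (plain Rust, no DLA calls); the numbers below mirror the 8-channel, 16-filter, `groups = 2` case from the doc comment.

```rust
/// Standalone sanity check of the grouped-conv channel arithmetic.
fn grouped_shapes(in_channels: usize, num_kernels: usize, groups: usize) -> (usize, usize, usize) {
    assert_eq!(in_channels % groups, 0, "input channels must be divisible by groups");
    assert_eq!(num_kernels % groups, 0, "kernel count must be divisible by groups");
    let channels_per_group = in_channels / groups; // channels each group consumes
    let filters_per_group = num_kernels / groups;  // filters each group applies
    let out_channels = filters_per_group * groups; // equals num_kernels after concatenation
    (channels_per_group, filters_per_group, out_channels)
}

// grouped_shapes(8, 16, 2) == (4, 8, 16), matching the example in the doc comment.
```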

fn run_layers<T: DlaOutput + Clone>(
4 changes: 2 additions & 2 deletions examples/hpc/dla-driver/src/tensor3.rs
@@ -184,7 +184,7 @@ impl<T: Clone> Tensor3<T> {
}

/// Concatenates tensors along the least significant axis (axis=2) by interleaving them
pub fn concat_interleaved(tensors: Vec<Tensor3<T>>) -> Tensor3<T> {
pub fn concat_interleaved(tensors: &[Tensor3<T>]) -> Tensor3<T> {
let _target_order = tensors[0].order();
let (height, width, channels) = (
tensors[0].height(),
@@ -196,7 +196,7 @@
for h in 0..height {
for w in 0..width {
for c in 0..channels {
for tensor in &tensors {
for tensor in tensors {
intermediary_buffer.push(tensor.data[(h, w, c)].clone());
}
}
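To make the interleaving order of `concat_interleaved` concrete, here is a minimal sketch of the same inner loops over plain vectors (two single-pixel tensors with two channels each; the `Tensor3` indexing API is not used).

```rust
// Mirrors the innermost loops of concat_interleaved for one (h, w) position:
// for each channel index, one element is taken from every tensor in turn.
let a = vec![1, 2];   // tensor A channels: a0, a1
let b = vec![10, 20]; // tensor B channels: b0, b1
let mut out = Vec::new();
for c in 0..2 {
    for t in [&a, &b] {
        out.push(t[c]);
    }
}
assert_eq!(out, vec![1, 10, 2, 20]); // a0, b0, a1, b1: channels interleaved across tensors
```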
