From 8823714bd31afdfde53d5f761a583991d9e18f6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=A4in=C3=B6=20Granat?=
Date: Tue, 10 Dec 2024 11:19:13 +0200
Subject: [PATCH] Fix rebase changes, update documentation

---
 examples/hpc-c/tvm-hpc/README.md              |  20 ++--
 examples/hpc-c/tvm-hpc/tiny_perf_benchmark.py | 102 ++++++++++++------
 examples/hpc/dla-driver-ffi/src/lib.rs        |  44 ++++----
 examples/hpc/dla-driver/src/tensor3.rs        |  20 +---
 examples/hpc/dla-driver/src/utils.rs          |   1 -
 5 files changed, 102 insertions(+), 85 deletions(-)

diff --git a/examples/hpc-c/tvm-hpc/README.md b/examples/hpc-c/tvm-hpc/README.md
index 4aa72741..bd6b495e 100644
--- a/examples/hpc-c/tvm-hpc/README.md
+++ b/examples/hpc-c/tvm-hpc/README.md
@@ -2,7 +2,7 @@
 This build is only tested on python3.11.
 
 # Installing dependencies
-## Installing TVM
+## Installing TVM with Headsail backend
 Get sources
 ``` sh
 git clone --recursive --depth=1 https://github.com/soc-hub-fi/headsail-tvm tvm
@@ -19,7 +19,6 @@ cp /build/config.cma
 For example
 ``` sh
 cp headsail-vp/examples/hpc-c/tvm-hpc/config.cmake tvm/build/config.cmake
-
 ```
 
 To enable codegen modify config.cmake file in the build directory by setting line 162 value to pointing at llvm-config.
@@ -31,7 +30,7 @@ set(USE_HEADSAIL ON)
 
 ### Building TVM
 
-Build in the previously greated build directory
+Build in the previously created build directory
 ``` sh
 cd build
 cmake ..
@@ -49,7 +48,7 @@ More information in https://tvm.apache.org/docs/install/from_source.html
 
 ## Python dependencies
 
-Python dependencies are needed for building TVM models from onnx graphs and must be available during tvm-hpc compilation.
+Python dependencies are needed for building TVM models from TFLite graphs and must be available during tvm-hpc compilation.
 
 Install python dependencies for TVM
 ``` sh
@@ -67,7 +66,7 @@ pip install -r requirements.txt
 
 # Building project
 ## Fetching the datasets
-To run Tinyperf benchmark we need to obtain the needed datasets. Easiest way to do this is by runnning the `get_testing_data` script.
+To run the TinyPerf benchmark we need to obtain the needed datasets. The easiest way to do this is by running the `get_testing_data` script.
 ```sh
 ./get_testing_data.sh
 ```
@@ -77,13 +76,14 @@
 In project folder (tvm-hpc)
 ```sh
 mkdir build
 cd build
-cmake ..
+cmake .. -DUSE_PERF_KEYWORD_SPOTTING=ON -DUSE_ACCELERATOR=ON
 make
 ```
-This creates a binary called headsail-tvm
+This creates a binary called headsail-tvm with the model for the MLPerf Tiny Keyword Spotting task embedded, with convolutions assigned to the DLA.
+Other model options are `[-DUSE_PERF_IMAGE_CLASSIFICATION=ON, -DUSE_PERF_VISUAL_WAKE_WORDS=ON]`. Use of the accelerator is controlled with the `-DUSE_ACCELERATOR=[ON/OFF]` flag.
-# Running in renode
-After succesful build, the resulting binary can be run with Headsail's virtual prototype in Renode
+# Running in Renode
+After a successful build, the resulting binary can be run with Headsail's virtual prototype in Renode
 ```sh
 cd /headsail-vp/scripts
 ./run_on_hpc.sh ../examples/hpc-c/tvm-hpc/build/headsail-tvm
@@ -92,5 +92,5 @@ cd /headsail-vp/scripts
 ## Running the benchmark
 To run the TinyPerf benchmark run the tiny_perf_benchmark.py script with the `-b` options with the wanted benchmark `[ic, kws, vww]`.
 ```sh
-python tiny_perf_benchmark.py -b ic
+python tiny_perf_benchmark.py -b kws
 ```
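Before the script diff below: the benchmark drives the firmware through the UART pseudo-terminal that Renode exposes (the `UART = "/tmp/uart0"` constant), writing a stimulus and parsing the `Prediction:` line the firmware prints back. A minimal sketch of one round trip, assuming pyserial; the single leading label byte is an illustrative assumption — the actual framing is `send_stimulus` in the diff:

```python
# Minimal sketch of one stimulus/response round trip over Renode's UART pty.
# Assumes pyserial; the label byte before the payload is a guess at the
# framing, the rest mirrors send_stimulus()/wait_for_result() below.
import serial
import numpy as np

def classify(payload: bytes, label: int | None = None) -> int:
    ser = serial.Serial("/tmp/uart0", 9600)
    if label is not None:
        ser.write(bytes([label]))      # hypothetical expected-label byte
    ser.write(payload)                 # raw int8 tensor, e.g. 96*96*3 bytes
    while ser.readline() != b"Prediction:\n":
        pass                           # skip firmware log lines
    scores = ser.readline()[:-1]       # class scores, one signed byte each
    ser.close()
    return int(np.argmax([((b & 0xFF) ^ 0x80) - 0x80 for b in scores]))
```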
diff --git a/examples/hpc-c/tvm-hpc/tiny_perf_benchmark.py b/examples/hpc-c/tvm-hpc/tiny_perf_benchmark.py
index c3360423..d4c7d594 100644
--- a/examples/hpc-c/tvm-hpc/tiny_perf_benchmark.py
+++ b/examples/hpc-c/tvm-hpc/tiny_perf_benchmark.py
@@ -15,36 +15,40 @@ import time
 
 import numpy as np
 
-UART = '/tmp/uart0'
+UART = "/tmp/uart0"
 ROOT_PATH = Path(__file__).parents[0]
 DATA_DIR = ROOT_PATH / "dev_data"
 KWS_DATA_DIR = DATA_DIR / "kws01"
 VWW_DATA_DIR = DATA_DIR / "vw_coco2014_96"
-VWW_NON_PERSON_DATA_DIR = VWW_DATA_DIR / "non_person" 
+VWW_NON_PERSON_DATA_DIR = VWW_DATA_DIR / "non_person"
 VWW_PERSON_DATA_DIR = VWW_DATA_DIR / "person"
 IC_DATA_DIR = DATA_DIR / "cifar-10-batches-py"
 AD_DATA_DIR = DATA_DIR / "ToyCar" / "test"
 
 # UTILS
-def print_matrix(arr, format_type='signed'):
+def print_matrix(arr, format_type="signed"):
     for row in arr:
         for elem in row:
-            if format_type == 'signed':
+            if format_type == "signed":
                 print(f"{int(elem):d}", end="\t")  # Signed decimal
-            elif format_type == 'unsigned':
+            elif format_type == "unsigned":
                 print(f"{int(elem) & 0xFFFFFFFF:d}", end="\t")  # Unsigned decimal
-            elif format_type == 'hex':
+            elif format_type == "hex":
                 print(f"{int(elem) & 0xFFFFFFFF:08x}", end="\t")  # Hexadecimal
             else:
-                raise ValueError("Invalid format_type. Use 'signed', 'unsigned', or 'hex'.")
+                raise ValueError(
+                    "Invalid format_type. Use 'signed', 'unsigned', or 'hex'."
+                )
         print()
 
+
 def accuracy_report(gt, prediction):
     print("Accuracy: {:.3f}".format(accuracy_score(gt, prediction)))
     print("Confusion matrix:\n{}".format(confusion_matrix(gt, prediction)))
     print(classification_report(gt, prediction))
 
+
 def send_stimulus(data, label=None):
     print("Writing {} bytes as stimulus...".format(len(data)))
     if label is not None:
@@ -54,19 +58,20 @@ def send_stimulus(data, label=None):
     ser.write(bytes(data))
     ser.close()
 
+
 def wait_for_result():
     print("Waiting for results...")
-    ser = serial.Serial(UART, 9600) 
+    ser = serial.Serial(UART, 9600)
    output = ser.readline()
-    while output != b'Prediction:\n':
+    while output != b"Prediction:\n":
         output = ser.readline()
     output = ser.readline()
     ser.close()
     output = bytearray(output)
     results = []
     for x in output:
-        results.append(((x & 0xff) ^ 0x80) - 0x80) # Append signed
-    results = results[:-1] # Remove line break
+        results.append(((x & 0xFF) ^ 0x80) - 0x80)  # Append signed
+    results = results[:-1]  # Remove line break
     print(results)
     print("Predicted class: {}".format(np.argmax(results)))
     print("\n")
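One detail worth calling out in `wait_for_result` above: `((x & 0xFF) ^ 0x80) - 0x80` reinterprets an unsigned byte as a signed i8 without NumPy. A quick worked check of the idiom:

```python
# XOR with 0x80 flips the sign bit; subtracting 0x80 shifts the range back,
# so 0..255 maps onto the two's-complement values -128..127.
def to_i8(x: int) -> int:
    return ((x & 0xFF) ^ 0x80) - 0x80

assert to_i8(0x00) == 0
assert to_i8(0x7F) == 127   # largest positive i8
assert to_i8(0x80) == -128  # smallest negative i8
assert to_i8(0xFF) == -1    # all bits set
```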
@@ -79,14 +84,20 @@ def read_kws_file(path):
         content = file.read()
     return content
 
+
 def get_kws_stimulus():
-    df = pd.read_csv(KWS_DATA_DIR / "y_labels.csv", names=["filename", "no_classes", "class"])
+    df = pd.read_csv(
+        KWS_DATA_DIR / "y_labels.csv", names=["filename", "no_classes", "class"]
+    )
     data = read_kws_file(KWS_DATA_DIR / df["filename"][0])
     print("Expected label:", df["class"][0])
     return data
 
+
 def run_kws(total_samples=200):
-    df = pd.read_csv(KWS_DATA_DIR / "y_labels.csv", names=["filename", "no_classes", "class"])
+    df = pd.read_csv(
+        KWS_DATA_DIR / "y_labels.csv", names=["filename", "no_classes", "class"]
+    )
 
     class_counts = df["class"].value_counts()
 
@@ -116,13 +127,13 @@ def run_kws(total_samples=200):
     balanced_df = balanced_df.sample(frac=1, random_state=42)
 
     predictions = []
-    for (i, filename) in enumerate(balanced_df["filename"]):
+    for i, filename in enumerate(balanced_df["filename"]):
         data = read_kws_file(KWS_DATA_DIR / filename)
         send_stimulus(data, df["class"][i])
         predictions.append(np.argmax(wait_for_result()))
         # Mid run report
-        accuracy_report(balanced_df["class"][:len(predictions)], predictions)
+        accuracy_report(balanced_df["class"][: len(predictions)], predictions)
 
     print("Final accuracy report for Keyword Spotting:")
     accuracy_report(balanced_df["class"], predictions)
@@ -130,31 +141,53 @@
 # VWW
 def read_vww_file(path):
-    #Image loading and preprocessing
+    # Image loading and preprocessing
     image = tf.io.read_file(str(path))
     image = tf.image.decode_jpeg(image, channels=3)
-    image = tf.image.resize(image, [96,96])
+    image = tf.image.resize(image, [96, 96])
     image = np.array(image, dtype=np.int8)
     image = image - 128
     return image.astype(np.int8)
 
+
 def get_vww_stimulus():
     items = os.listdir(VWW_NON_PERSON_DATA_DIR)
-    non_persons = [item for item in items if os.path.isfile(os.path.join(VWW_NON_PERSON_DATA_DIR, item)) and item.startswith("COCO_val")]
+    non_persons = [
+        item
+        for item in items
+        if os.path.isfile(os.path.join(VWW_NON_PERSON_DATA_DIR, item))
+        and item.startswith("COCO_val")
+    ]
     data = read_vww_file(VWW_NON_PERSON_DATA_DIR / non_persons[0])
     print("Expected label: 1")
     return data.tobytes()
 
+
 def run_vww(total_samples=100):
     items = os.listdir(VWW_NON_PERSON_DATA_DIR)
-    non_persons = [item for item in items if os.path.isfile(os.path.join(VWW_NON_PERSON_DATA_DIR, item)) and item.startswith("COCO_val")]
+    non_persons = [
+        item
+        for item in items
+        if os.path.isfile(os.path.join(VWW_NON_PERSON_DATA_DIR, item))
+        and item.startswith("COCO_val")
+    ]
+    non_persons.sort()
     items = os.listdir(VWW_PERSON_DATA_DIR)
-    persons = [item for item in items if os.path.isfile(os.path.join(VWW_PERSON_DATA_DIR, item)) and item.startswith("COCO_val")]
+    persons = [
+        item
+        for item in items
+        if os.path.isfile(os.path.join(VWW_PERSON_DATA_DIR, item))
+        and item.startswith("COCO_val")
+    ]
+    persons.sort()
 
     print("Number of non_persons", len(non_persons))
     print("Number of persons", len(persons))
-    print("Input shape: ", np.shape(read_vww_file(VWW_NON_PERSON_DATA_DIR / non_persons[0])))
+    print(
+        "Input shape: ",
+        np.shape(read_vww_file(VWW_NON_PERSON_DATA_DIR / non_persons[0])),
+    )
 
     # Calculate balanced number of samples for each category
     samples_per_class = min(len(non_persons), len(persons), total_samples // 2)
@@ -186,22 +219,26 @@ def run_vww(total_samples=100):
     print("Final accuracy report for Visual Wakeup Word:")
     accuracy_report(gt, predictions)
 
+
 # IC
 def get_ic_stimulus():
     import pickle
+
     with open(IC_DATA_DIR / "test_batch", "rb") as file:
-        data = pickle.load(file, encoding='bytes')
-        print("Expected label:", data[b'labels'][0])
-        return data[b'data'][0].tobytes()
+        data = pickle.load(file, encoding="bytes")
+        print("Expected label:", data[b"labels"][0])
+        return data[b"data"][0].tobytes()
 
+
 def run_ic(total_samples=200):
     import pickle
+
     with open(IC_DATA_DIR / "test_batch", "rb") as file:
-        data = pickle.load(file, encoding='bytes')
-        print("Input shape: {}".format(np.shape(data[b'data'][0])))
+        data = pickle.load(file, encoding="bytes")
+        print("Input shape: {}".format(np.shape(data[b"data"][0])))
 
-    images = data[b'data']
-    labels = data[b'labels']
+    images = data[b"data"]
+    labels = data[b"labels"]
     print(labels)
 
     class_samples = {i: [] for i in range(10)}
@@ -234,8 +271,8 @@ def run_ic(total_samples=200):
     print(selected_labels)
 
     # Run inference on samples
-    for (i, image) in enumerate(selected_images):
-        #FROM CHW to HWC
+    for i, image in enumerate(selected_images):
+        # FROM CHW to HWC
         image = np.reshape(image, (3, 32, 32))
         image = np.rollaxis(image, 0, 3)
         image = image - 128
@@ -244,11 +281,11 @@ def run_ic(total_samples=200):
         send_stimulus(image.tobytes(), label)
 
         # Wait for inference result
-        prediction = (np.argmax(wait_for_result()))
+        prediction = np.argmax(wait_for_result())
         predictions.append(prediction)
 
         # Mid-run report
-        accuracy_report(selected_labels[:len(predictions)], predictions)
+        accuracy_report(selected_labels[: len(predictions)], predictions)
 
     print("Final accuracy report for Image Classification:")
     accuracy_report(selected_labels, predictions)
@@ -277,5 +314,6 @@ def main():
     else:
         print("Bad benchmark! Available benchmarks are: kws, vww, ic")
 
+
 if __name__ == "__main__":
     main()
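The layout shuffle in `run_ic` above (flat CIFAR-10 record → CHW → HWC → int8) is the part most worth double-checking; the same transform in isolation, on a synthetic buffer:

```python
import numpy as np

# A CIFAR-10 record is 3072 bytes laid out channel-first: 1024 R, 1024 G, 1024 B.
flat = np.arange(3072, dtype=np.uint8)

chw = np.reshape(flat, (3, 32, 32))   # (channel, height, width)
hwc = np.rollaxis(chw, 0, 3)          # move axis 0 last -> (32, 32, 3)
assert hwc.shape == (32, 32, 3)
assert hwc[5, 7, 2] == chw[2, 5, 7]   # same element, new layout

# Recenter 0..255 pixels into the int8 range the quantized model expects
img_i8 = (hwc.astype(np.int16) - 128).astype(np.int8)
```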
diff --git a/examples/hpc/dla-driver-ffi/src/lib.rs b/examples/hpc/dla-driver-ffi/src/lib.rs
index 5dec0c2b..648bde39 100644
--- a/examples/hpc/dla-driver-ffi/src/lib.rs
+++ b/examples/hpc/dla-driver-ffi/src/lib.rs
@@ -12,8 +12,8 @@ use core::slice;
 use dla_driver::layers::{conv2d, conv2d_bias, conv2d_bias_relu, conv2d_relu, grouped_conv2d};
 use dla_driver::tensor3::{rescale, Order3, Tensor3};
 use dla_driver::tensor4::{Order4, Tensor4};
-use dla_driver::{Padding, Stride};
 use dla_driver::utils::optimal_pp_bias_heuristic;
+use dla_driver::{Padding, Stride};
 
 /// Converts C-types to DLA Tensors for use with the highlevel layer
 unsafe fn ffi_data_import(
@@ -72,7 +72,7 @@ unsafe fn ffi_data_import(
 /// Initializes DLA by setting up necessary heap allocator from headsail-bsp. This should be called only once in the program.
 #[no_mangle]
 pub unsafe extern "C" fn dla_init() {
-    headsail_bsp::init_alloc();
+    headsail_bsp::init_heap();
 }
 
 /// Executes Conv2D on DLA with given parameters and writes result to output buffer.
@@ -412,11 +412,12 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_bias(
     let bias: Vec<i16> = unsafe {
         slice::from_raw_parts(bias as *const i32, bias_length)
             .into_iter()
-            .map(|x| (*x >> 8).clamp(i16::MIN as i32, i16::MAX as i32) as i16)
+            .map(|x| (*x).clamp(i16::MIN as i32, i16::MAX as i32) as i16)
             .collect()
     };
 
-    let optimized_pp = optimal_pp_bias_heuristic(&bias);
+    //let optimized_pp = optimal_pp_bias_heuristic(&bias);
+    let optimized_pp = 7;
 
     let mut result: Tensor3<i8> = conv2d_bias(
         input_tensor,
@@ -446,16 +447,13 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_bias(
     // To comply with TVM's expected value range our solution is to bit shift/clip the 16-bit result of
     // conv2d by 8 bits and shift if back in the driver. This causes some amount of data loss due to
     // the lost granularity of the values. The clipping amount is set by the pp_clip argument.
-    let mut res_i32: Vec<i32> = result.to_buffer()
-        .iter().map(|x: &i8| ((*x as i32) << optimized_pp)).collect();
+    let mut res_i32: Vec<i32> = result
+        .to_buffer()
+        .iter()
+        .map(|x: &i8| ((*x as i32) << optimized_pp))
+        .collect();
 
-    unsafe {
-        core::ptr::copy_nonoverlapping(
-            res_i32.as_mut_ptr(),
-            output,
-            result.get_size(),
-        )
-    };
+    unsafe { core::ptr::copy_nonoverlapping(res_i32.as_mut_ptr(), output, result.get_size()) };
 }
 
 /// # Arguments
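The comment block above describes the contract: the DLA's post-processing narrows the 16-bit convolution sum to i8, and the driver shifts the result back up by `optimized_pp` bits so TVM receives i32 values at roughly the original magnitude. A numeric sketch of that round trip; modelling the clip as an arithmetic right shift is an assumption based on the comment, and `7` mirrors the hard-coded `optimized_pp` above:

```python
# Round trip of the pp clip/shift described in the driver comment above.
PP = 7  # mirrors `let optimized_pp = 7;` in the diff

def dla_clip(acc16: int) -> int:
    # assumed model of the accelerator's post-processing narrowing to i8
    return max(-128, min(127, acc16 >> PP))

def driver_widen(v_i8: int) -> int:
    return v_i8 << PP  # what the FFI layer writes into TVM's i32 buffer

acc = 12345                                  # a 16-bit conv accumulator
assert driver_widen(dla_clip(acc)) == 12288  # low 7 bits are lost
```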
@@ -509,7 +507,7 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_grouped_bias(
     let bias: Vec<i16> = unsafe {
         slice::from_raw_parts(bias as *const i32, bias_length)
             .into_iter()
-            .map(|x| (*x >> 8).clamp(i16::MIN as i32, i16::MAX as i32) as i16)
+            .map(|x| (*x).clamp(i16::MIN as i32, i16::MAX as i32) as i16)
             .collect()
     };
 
@@ -533,10 +531,9 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_grouped_bias(
         Some(mac_clip),
         Some(optimized_pp),
         None,
-        groups
+        groups,
     );
 
-
     let input_order_string = unsafe { CStr::from_ptr(input_order).to_str().unwrap_unchecked() };
 
     // TVM requantization and clip
@@ -545,14 +542,11 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_grouped_bias(
     // To comply with TVM's expected value range our solution is to bit shift/clip the 16-bit result of
     // conv2d by 8 bits and shift if back in the driver. This causes some amount of data loss due to
     // the lost granularity of the values. The clipping amount is set by the pp_clip argument.
-    let mut res_i32: Vec<i32> = result.to_buffer()
-        .iter().map(|x: &i8| (*x as f32 * u32::pow(2, optimized_pp) as f32) as i32).collect();
+    let mut res_i32: Vec<i32> = result
+        .to_buffer()
+        .iter()
+        .map(|x: &i8| ((*x as i32) << optimized_pp))
+        .collect();
 
-    unsafe {
-        core::ptr::copy_nonoverlapping(
-            res_i32.as_mut_ptr(),
-            output,
-            result.get_size(),
-        )
-    };
+    unsafe { core::ptr::copy_nonoverlapping(res_i32.as_mut_ptr(), output, result.get_size()) };
 }
diff --git a/examples/hpc/dla-driver/src/tensor3.rs b/examples/hpc/dla-driver/src/tensor3.rs
index dd55aeb0..ad30c38b 100644
--- a/examples/hpc/dla-driver/src/tensor3.rs
+++ b/examples/hpc/dla-driver/src/tensor3.rs
@@ -1,6 +1,6 @@
 use alloc::vec::*;
 use core::ffi::c_char;
-use ndarray::{s, Array, Array3};
+use ndarray::{Array, Array3, ArrayView3, Axis, s, stack, concatenate};
 
 #[derive(Clone, Copy, Debug, PartialEq)]
 pub enum Order3 {
@@ -183,27 +183,11 @@ impl Tensor3 {
         self.order
     }
 
-    /// Concate vector of Tensor3 in order to single Tensor3
-    pub fn concat(tensors: Vec<Tensor3<T>>, axis: usize) -> Tensor3<T> {
-        let target_order = tensors[0].order();
-        let arrays: Vec<Array3<T>> = tensors.into_iter().map(|t| t.data).collect();
-        // Concatenate along the specified axis
-        let stacked = concatenate(Axis(axis), &arrays.iter().map(|a| a.view()).collect::<Vec<_>>())
-            .expect("Concatenation failed due to incompatible shapes");
-
-        Tensor3 {
-            data: stacked,
-            order: target_order,
-        }
-    }
-
     /// Concatenates a Tensor along the least significant axis (axis=2) by interleaving the tensors
     pub fn concat_interleaved(tensors: Vec<Tensor3<T>>) -> Tensor3<T> {
         let target_order = tensors[0].order();
-        let (height, width, channels) = (tensors[0].height(), tensors[0].width(), tensors[0].channels());
         let mut intermediary_buffer: Vec<T> =
             Vec::with_capacity(height * width * channels * tensors.len());
-
         for h in 0..height {
             for w in 0..width {
                 for c in 0..channels {
@@ -287,6 +271,7 @@ impl Tensor3 {
         data.permute(order);
         data.to_buffer()
     }
+
 }
 
 pub fn rescale(
@@ -316,6 +301,7 @@ pub fn rescale(
             let value = (input_scale / scale) * (*x as f32 * pre_scale - input_zero as f32)
                 + output_zero as f32;
             *x = value.clamp(i8::MIN as f32, i8::MAX as f32) as i8
+
         });
     }
 }
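For reference, the arithmetic inside `rescale` above, restated for a single element: the value is requantized from the input scale/zero point to the output scale/zero point and clamped to the i8 range. A minimal sketch using the parameter names from the Rust code:

```python
def rescale_one(x, input_scale, scale, pre_scale, input_zero, output_zero):
    # Same formula as the Rust closure above, for a single element
    value = (input_scale / scale) * (x * pre_scale - input_zero) + output_zero
    return int(max(-128.0, min(127.0, value)))  # clamp to i8, truncate like `as i8`

# e.g. moving x = 50 from (scale 0.1, zero point 3) to (scale 0.2, zero point -5):
assert rescale_one(50, 0.1, 0.2, 1.0, 3, -5) == 18  # 0.5 * 47 - 5 = 18.5 -> 18
```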
diff --git a/examples/hpc/dla-driver/src/utils.rs b/examples/hpc/dla-driver/src/utils.rs
index 160d4495..deeb646c 100644
--- a/examples/hpc/dla-driver/src/utils.rs
+++ b/examples/hpc/dla-driver/src/utils.rs
@@ -127,5 +127,4 @@ pub fn optimal_pp_bias_heuristic(bias: &Vec<i16>) -> u32 {
         return 8
     }
     pp
-
 }