From 8823714bd31afdfde53d5f761a583991d9e18f6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=A4in=C3=B6=20Granat?=
Date: Tue, 10 Dec 2024 11:19:13 +0200
Subject: [PATCH] Fix rebase changes, update documentation

---
 examples/hpc-c/tvm-hpc/README.md              |  20 ++--
 examples/hpc-c/tvm-hpc/tiny_perf_benchmark.py | 102 ++++++++++++------
 examples/hpc/dla-driver-ffi/src/lib.rs        |  44 ++++----
 examples/hpc/dla-driver/src/tensor3.rs        |  20 +---
 examples/hpc/dla-driver/src/utils.rs          |   1 -
 5 files changed, 102 insertions(+), 85 deletions(-)

diff --git a/examples/hpc-c/tvm-hpc/README.md b/examples/hpc-c/tvm-hpc/README.md
index 4aa72741..bd6b495e 100644
--- a/examples/hpc-c/tvm-hpc/README.md
+++ b/examples/hpc-c/tvm-hpc/README.md
@@ -2,7 +2,7 @@
 This build is only tested on python3.11.
 
 # Installing dependencies
-## Installing TVM
+## Installing TVM with Headsail backend
 Get sources
 ``` sh
 git clone --recursive --depth=1 https://github.com/soc-hub-fi/headsail-tvm tvm
@@ -19,7 +19,6 @@ cp /build/config.cma
 For example
 ``` sh
 cp headsail-vp/examples/hpc-c/tvm-hpc/config.cmake tvm/build/config.cmake
-
 ```
 
 To enable codegen modify config.cmake file in the build directory by setting line 162 value to pointing at llvm-config.
@@ -31,7 +30,7 @@ set(USE_HEADSAIL ON)
 
 ### Building TVM
 
-Build in the previously greated build directory
+Build in the previously created build directory
 ``` sh
 cd build
 cmake ..
@@ -49,7 +48,7 @@ More information in https://tvm.apache.org/docs/install/from_source.html
 
 ## Python dependencies
 
-Python dependencies are needed for building TVM models from onnx graphs and must be available during tvm-hpc compilation.
+Python dependencies are needed for building TVM models from TFLite graphs and must be available during tvm-hpc compilation.
 
 Install python dependencies for TVM
 ``` sh
@@ -67,7 +66,7 @@ pip install -r requirements.txt
 
 # Building project
 ## Fetching the datasets
-To run Tinyperf benchmark we need to obtain the needed datasets. Easiest way to do this is by runnning the `get_testing_data` script.
+To run the TinyPerf benchmark we need to obtain the needed datasets. The easiest way to do this is by running the `get_testing_data` script.
 ```sh
 ./get_testing_data.sh
 ```
@@ -77,13 +76,14 @@
 In project folder (tvm-hpc)
 ```sh
 mkdir build
 cd build
-cmake ..
+cmake .. -DUSE_PERF_KEYWORD_SPOTTING=ON -DUSE_ACCELERATOR=ON
 make
 ```
-This creates a binary called headsail-tvm
+This creates a binary called headsail-tvm with the model for the MLPerf Tiny Keyword Spotting task embedded, with convolutions assigned to the DLA.
+Other model options are `[-DUSE_PERF_IMAGE_CLASSIFICATION=ON, -DUSE_PERF_VISUAL_WAKE_WORDS=ON]`. Use of the accelerator is controlled with the `-DUSE_ACCELERATOR=[ON/OFF]` flag.
-# Running in renode
-After succesful build, the resulting binary can be run with Headsail's virtual prototype in Renode
+# Running in Renode
+After a successful build, the resulting binary can be run with Headsail's virtual prototype in Renode
 ```sh
 cd /headsail-vp/scripts
 ./run_on_hpc.sh ../examples/hpc-c/tvm-hpc/build/headsail-tvm
@@ -92,5 +92,5 @@ cd /headsail-vp/scripts
 ## Running the benchmark
 To run the TinyPerf benchmark run the tiny_perf_benchmark.py script with the `-b` options with the wanted benchmark `[ic, kws, vww]`.
 ```sh
-python tiny_perf_benchmark.py -b ic
+python tiny_perf_benchmark.py -b kws
 ```
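Before the script diff below: the benchmark drives the firmware through the UART pseudo-terminal that Renode exposes (the `UART = "/tmp/uart0"` constant), writing a stimulus and parsing the `Prediction:` line the firmware prints back. A minimal sketch of one round trip, assuming pyserial; the single leading label byte is an illustrative assumption — the actual framing is `send_stimulus` in the diff:

```python
# Minimal sketch of one stimulus/response round trip over Renode's UART pty.
# Assumes pyserial; the label byte before the payload is a guess at the
# framing, the rest mirrors send_stimulus()/wait_for_result() below.
import serial
import numpy as np

def classify(payload: bytes, label: int | None = None) -> int:
    ser = serial.Serial("/tmp/uart0", 9600)
    if label is not None:
        ser.write(bytes([label]))      # hypothetical expected-label byte
    ser.write(payload)                 # raw int8 tensor, e.g. 96*96*3 bytes
    while ser.readline() != b"Prediction:\n":
        pass                           # skip firmware log lines
    scores = ser.readline()[:-1]       # class scores, one signed byte each
    ser.close()
    return int(np.argmax([((b & 0xFF) ^ 0x80) - 0x80 for b in scores]))
```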
diff --git a/examples/hpc-c/tvm-hpc/tiny_perf_benchmark.py b/examples/hpc-c/tvm-hpc/tiny_perf_benchmark.py
index c3360423..d4c7d594 100644
--- a/examples/hpc-c/tvm-hpc/tiny_perf_benchmark.py
+++ b/examples/hpc-c/tvm-hpc/tiny_perf_benchmark.py
@@ -15,36 +15,40 @@ import time
 
 import numpy as np
 
-UART = '/tmp/uart0'
+UART = "/tmp/uart0"
 ROOT_PATH = Path(__file__).parents[0]
 DATA_DIR = ROOT_PATH / "dev_data"
 KWS_DATA_DIR = DATA_DIR / "kws01"
 VWW_DATA_DIR = DATA_DIR / "vw_coco2014_96"
-VWW_NON_PERSON_DATA_DIR = VWW_DATA_DIR / "non_person" 
+VWW_NON_PERSON_DATA_DIR = VWW_DATA_DIR / "non_person"
 VWW_PERSON_DATA_DIR = VWW_DATA_DIR / "person"
 IC_DATA_DIR = DATA_DIR / "cifar-10-batches-py"
 AD_DATA_DIR = DATA_DIR / "ToyCar" / "test"
 
 # UTILS
-def print_matrix(arr, format_type='signed'):
+def print_matrix(arr, format_type="signed"):
     for row in arr:
         for elem in row:
-            if format_type == 'signed':
+            if format_type == "signed":
                 print(f"{int(elem):d}", end="\t")  # Signed decimal
-            elif format_type == 'unsigned':
+            elif format_type == "unsigned":
                 print(f"{int(elem) & 0xFFFFFFFF:d}", end="\t")  # Unsigned decimal
-            elif format_type == 'hex':
+            elif format_type == "hex":
                 print(f"{int(elem) & 0xFFFFFFFF:08x}", end="\t")  # Hexadecimal
             else:
-                raise ValueError("Invalid format_type. Use 'signed', 'unsigned', or 'hex'.")
+                raise ValueError(
+                    "Invalid format_type. Use 'signed', 'unsigned', or 'hex'."
+                )
         print()
 
+
 def accuracy_report(gt, prediction):
     print("Accuracy: {:.3f}".format(accuracy_score(gt, prediction)))
     print("Confusion matrix:\n{}".format(confusion_matrix(gt, prediction)))
     print(classification_report(gt, prediction))
 
+
 def send_stimulus(data, label=None):
     print("Writing {} bytes as stimulus...".format(len(data)))
     if label is not None:
@@ -54,19 +58,20 @@ def send_stimulus(data, label=None):
     ser.write(bytes(data))
     ser.close()
 
+
 def wait_for_result():
     print("Waiting for results...")
-    ser = serial.Serial(UART, 9600) 
+    ser = serial.Serial(UART, 9600)
    output = ser.readline()
-    while output != b'Prediction:\n':
+    while output != b"Prediction:\n":
         output = ser.readline()
     output = ser.readline()
     ser.close()
     output = bytearray(output)
     results = []
     for x in output:
-        results.append(((x & 0xff) ^ 0x80) - 0x80) # Append signed
-    results = results[:-1] # Remove line break
+        results.append(((x & 0xFF) ^ 0x80) - 0x80)  # Append signed
+    results = results[:-1]  # Remove line break
     print(results)
     print("Predicted class: {}".format(np.argmax(results)))
     print("\n")
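One detail worth calling out in `wait_for_result` above: `((x & 0xFF) ^ 0x80) - 0x80` reinterprets an unsigned byte as a signed i8 without NumPy. A quick worked check of the idiom:

```python
# XOR with 0x80 flips the sign bit; subtracting 0x80 shifts the range back,
# so 0..255 maps onto the two's-complement values -128..127.
def to_i8(x: int) -> int:
    return ((x & 0xFF) ^ 0x80) - 0x80

assert to_i8(0x00) == 0
assert to_i8(0x7F) == 127   # largest positive i8
assert to_i8(0x80) == -128  # smallest negative i8
assert to_i8(0xFF) == -1    # all bits set
```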
@@ -79,14 +84,20 @@ def read_kws_file(path):
         content = file.read()
     return content
 
+
 def get_kws_stimulus():
-    df = pd.read_csv(KWS_DATA_DIR / "y_labels.csv", names=["filename", "no_classes", "class"])
+    df = pd.read_csv(
+        KWS_DATA_DIR / "y_labels.csv", names=["filename", "no_classes", "class"]
+    )
     data = read_kws_file(KWS_DATA_DIR / df["filename"][0])
     print("Expected label:", df["class"][0])
     return data
 
+
 def run_kws(total_samples=200):
-    df = pd.read_csv(KWS_DATA_DIR / "y_labels.csv", names=["filename", "no_classes", "class"])
+    df = pd.read_csv(
+        KWS_DATA_DIR / "y_labels.csv", names=["filename", "no_classes", "class"]
+    )
 
     class_counts = df["class"].value_counts()
 
@@ -116,13 +127,13 @@ def run_kws(total_samples=200):
     balanced_df = balanced_df.sample(frac=1, random_state=42)
 
     predictions = []
-    for (i, filename) in enumerate(balanced_df["filename"]):
+    for i, filename in enumerate(balanced_df["filename"]):
         data = read_kws_file(KWS_DATA_DIR / filename)
         send_stimulus(data, df["class"][i])
         predictions.append(np.argmax(wait_for_result()))
         # Mid run report
-        accuracy_report(balanced_df["class"][:len(predictions)], predictions)
+        accuracy_report(balanced_df["class"][: len(predictions)], predictions)
 
     print("Final accuracy report for Keyword Spotting:")
     accuracy_report(balanced_df["class"], predictions)
@@ -130,31 +141,53 @@
 # VWW
 def read_vww_file(path):
-    #Image loading and preprocessing
+    # Image loading and preprocessing
     image = tf.io.read_file(str(path))
     image = tf.image.decode_jpeg(image, channels=3)
-    image = tf.image.resize(image, [96,96])
+    image = tf.image.resize(image, [96, 96])
     image = np.array(image, dtype=np.int8)
     image = image - 128
     return image.astype(np.int8)
 
+
 def get_vww_stimulus():
     items = os.listdir(VWW_NON_PERSON_DATA_DIR)
-    non_persons = [item for item in items if os.path.isfile(os.path.join(VWW_NON_PERSON_DATA_DIR, item)) and item.startswith("COCO_val")]
+    non_persons = [
+        item
+        for item in items
+        if os.path.isfile(os.path.join(VWW_NON_PERSON_DATA_DIR, item))
+        and item.startswith("COCO_val")
+    ]
     data = read_vww_file(VWW_NON_PERSON_DATA_DIR / non_persons[0])
     print("Expected label: 1")
     return data.tobytes()
 
+
 def run_vww(total_samples=100):
     items = os.listdir(VWW_NON_PERSON_DATA_DIR)
-    non_persons = [item for item in items if os.path.isfile(os.path.join(VWW_NON_PERSON_DATA_DIR, item)) and item.startswith("COCO_val")]
+    non_persons = [
+        item
+        for item in items
+        if os.path.isfile(os.path.join(VWW_NON_PERSON_DATA_DIR, item))
+        and item.startswith("COCO_val")
+    ]
+    non_persons.sort()
     items = os.listdir(VWW_PERSON_DATA_DIR)
-    persons = [item for item in items if os.path.isfile(os.path.join(VWW_PERSON_DATA_DIR, item)) and item.startswith("COCO_val")]
+    persons = [
+        item
+        for item in items
+        if os.path.isfile(os.path.join(VWW_PERSON_DATA_DIR, item))
+        and item.startswith("COCO_val")
+    ]
+    persons.sort()
 
     print("Number of non_persons", len(non_persons))
     print("Number of persons", len(persons))
-    print("Input shape: ", np.shape(read_vww_file(VWW_NON_PERSON_DATA_DIR / non_persons[0])))
+    print(
+        "Input shape: ",
+        np.shape(read_vww_file(VWW_NON_PERSON_DATA_DIR / non_persons[0])),
+    )
 
     # Calculate balanced number of samples for each category
     samples_per_class = min(len(non_persons), len(persons), total_samples // 2)
@@ -186,22 +219,26 @@ def run_vww(total_samples=100):
     print("Final accuracy report for Visual Wakeup Word:")
     accuracy_report(gt, predictions)
 
+
 # IC
 def get_ic_stimulus():
     import pickle
+
     with open(IC_DATA_DIR / "test_batch", "rb") as file:
-        data = pickle.load(file, encoding='bytes')
-        print("Expected label:", data[b'labels'][0])
-        return data[b'data'][0].tobytes()
+        data = pickle.load(file, encoding="bytes")
+        print("Expected label:", data[b"labels"][0])
+        return data[b"data"][0].tobytes()
 
+
 def run_ic(total_samples=200):
     import pickle
+
     with open(IC_DATA_DIR / "test_batch", "rb") as file:
-        data = pickle.load(file, encoding='bytes')
-        print("Input shape: {}".format(np.shape(data[b'data'][0])))
+        data = pickle.load(file, encoding="bytes")
+        print("Input shape: {}".format(np.shape(data[b"data"][0])))
 
-    images = data[b'data']
-    labels = data[b'labels']
+    images = data[b"data"]
+    labels = data[b"labels"]
     print(labels)
 
     class_samples = {i: [] for i in range(10)}
@@ -234,8 +271,8 @@ def run_ic(total_samples=200):
     print(selected_labels)
 
     # Run inference on samples
-    for (i, image) in enumerate(selected_images):
-        #FROM CHW to HWC
+    for i, image in enumerate(selected_images):
+        # FROM CHW to HWC
         image = np.reshape(image, (3, 32, 32))
         image = np.rollaxis(image, 0, 3)
         image = image - 128
@@ -244,11 +281,11 @@ def run_ic(total_samples=200):
         send_stimulus(image.tobytes(), label)
 
         # Wait for inference result
-        prediction = (np.argmax(wait_for_result()))
+        prediction = np.argmax(wait_for_result())
         predictions.append(prediction)
 
         # Mid-run report
-        accuracy_report(selected_labels[:len(predictions)], predictions)
+        accuracy_report(selected_labels[: len(predictions)], predictions)
 
     print("Final accuracy report for Image Classification:")
     accuracy_report(selected_labels, predictions)
@@ -277,5 +314,6 @@ def main():
     else:
         print("Bad benchmark! Available benchmarks are: kws, vww, ic")
 
+
 if __name__ == "__main__":
     main()
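The layout shuffle in `run_ic` above (flat CIFAR-10 record → CHW → HWC → int8) is the part most worth double-checking; the same transform in isolation, on a synthetic buffer:

```python
import numpy as np

# A CIFAR-10 record is 3072 bytes laid out channel-first: 1024 R, 1024 G, 1024 B.
flat = np.arange(3072, dtype=np.uint8)

chw = np.reshape(flat, (3, 32, 32))   # (channel, height, width)
hwc = np.rollaxis(chw, 0, 3)          # move axis 0 last -> (32, 32, 3)
assert hwc.shape == (32, 32, 3)
assert hwc[5, 7, 2] == chw[2, 5, 7]   # same element, new layout

# Recenter 0..255 pixels into the int8 range the quantized model expects
img_i8 = (hwc.astype(np.int16) - 128).astype(np.int8)
```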
diff --git a/examples/hpc/dla-driver-ffi/src/lib.rs b/examples/hpc/dla-driver-ffi/src/lib.rs
index 5dec0c2b..648bde39 100644
--- a/examples/hpc/dla-driver-ffi/src/lib.rs
+++ b/examples/hpc/dla-driver-ffi/src/lib.rs
@@ -12,8 +12,8 @@ use core::slice;
 use dla_driver::layers::{conv2d, conv2d_bias, conv2d_bias_relu, conv2d_relu, grouped_conv2d};
 use dla_driver::tensor3::{rescale, Order3, Tensor3};
 use dla_driver::tensor4::{Order4, Tensor4};
-use dla_driver::{Padding, Stride};
 use dla_driver::utils::optimal_pp_bias_heuristic;
+use dla_driver::{Padding, Stride};
 
 /// Converts C-types to DLA Tensors for use with the highlevel layer
 unsafe fn ffi_data_import(
@@ -72,7 +72,7 @@ unsafe fn ffi_data_import(
 /// Initializes DLA by setting up necessary heap allocator from headsail-bsp. This should be called only once in the program.
 #[no_mangle]
 pub unsafe extern "C" fn dla_init() {
-    headsail_bsp::init_alloc();
+    headsail_bsp::init_heap();
 }
 
 /// Executes Conv2D on DLA with given parameters and writes result to output buffer.
@@ -412,11 +412,12 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_bias(
     let bias: Vec<i16> = unsafe {
         slice::from_raw_parts(bias as *const i32, bias_length)
             .into_iter()
-            .map(|x| (*x >> 8).clamp(i16::MIN as i32, i16::MAX as i32) as i16)
+            .map(|x| (*x).clamp(i16::MIN as i32, i16::MAX as i32) as i16)
             .collect()
     };
 
-    let optimized_pp = optimal_pp_bias_heuristic(&bias);
+    //let optimized_pp = optimal_pp_bias_heuristic(&bias);
+    let optimized_pp = 7;
 
     let mut result: Tensor3<i8> = conv2d_bias(
         input_tensor,
@@ -446,16 +447,13 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_bias(
     // To comply with TVM's expected value range our solution is to bit shift/clip the 16-bit result of
     // conv2d by 8 bits and shift if back in the driver. This causes some amount of data loss due to
     // the lost granularity of the values. The clipping amount is set by the pp_clip argument.
-    let mut res_i32: Vec<i32> = result.to_buffer()
-        .iter().map(|x: &i8| ((*x as i32) << optimized_pp)).collect();
+    let mut res_i32: Vec<i32> = result
+        .to_buffer()
+        .iter()
+        .map(|x: &i8| ((*x as i32) << optimized_pp))
+        .collect();
 
-    unsafe {
-        core::ptr::copy_nonoverlapping(
-            res_i32.as_mut_ptr(),
-            output,
-            result.get_size(),
-        )
-    };
+    unsafe { core::ptr::copy_nonoverlapping(res_i32.as_mut_ptr(), output, result.get_size()) };
 }
 
 /// # Arguments
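The comment block above describes the contract: the DLA's post-processing narrows the 16-bit convolution sum to i8, and the driver shifts the result back up by `optimized_pp` bits so TVM receives i32 values at roughly the original magnitude. A numeric sketch of that round trip; modelling the clip as an arithmetic right shift is an assumption based on the comment, and `7` mirrors the hard-coded `optimized_pp` above:

```python
# Round trip of the pp clip/shift described in the driver comment above.
PP = 7  # mirrors `let optimized_pp = 7;` in the diff

def dla_clip(acc16: int) -> int:
    # assumed model of the accelerator's post-processing narrowing to i8
    return max(-128, min(127, acc16 >> PP))

def driver_widen(v_i8: int) -> int:
    return v_i8 << PP  # what the FFI layer writes into TVM's i32 buffer

acc = 12345                                  # a 16-bit conv accumulator
assert driver_widen(dla_clip(acc)) == 12288  # low 7 bits are lost
```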
@@ -509,7 +507,7 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_grouped_bias(
     let bias: Vec<i16> = unsafe {
         slice::from_raw_parts(bias as *const i32, bias_length)
             .into_iter()
-            .map(|x| (*x >> 8).clamp(i16::MIN as i32, i16::MAX as i32) as i16)
+            .map(|x| (*x).clamp(i16::MIN as i32, i16::MAX as i32) as i16)
             .collect()
     };
 
@@ -533,10 +531,9 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_grouped_bias(
         Some(mac_clip),
         Some(optimized_pp),
         None,
-        groups
+        groups,
     );
 
-
     let input_order_string = unsafe { CStr::from_ptr(input_order).to_str().unwrap_unchecked() };
 
     // TVM requantization and clip
@@ -545,14 +542,11 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_grouped_bias(
     // To comply with TVM's expected value range our solution is to bit shift/clip the 16-bit result of
     // conv2d by 8 bits and shift if back in the driver. This causes some amount of data loss due to
     // the lost granularity of the values. The clipping amount is set by the pp_clip argument.
-    let mut res_i32: Vec<i32> = result.to_buffer()
-        .iter().map(|x: &i8| (*x as f32 * u32::pow(2, optimized_pp) as f32) as i32).collect();
+    let mut res_i32: Vec<i32> = result
+        .to_buffer()
+        .iter()
+        .map(|x: &i8| ((*x as i32) << optimized_pp))
+        .collect();
 
-    unsafe {
-        core::ptr::copy_nonoverlapping(
-            res_i32.as_mut_ptr(),
-            output,
-            result.get_size(),
-        )
-    };
+    unsafe { core::ptr::copy_nonoverlapping(res_i32.as_mut_ptr(), output, result.get_size()) };
 }
diff --git a/examples/hpc/dla-driver/src/tensor3.rs b/examples/hpc/dla-driver/src/tensor3.rs
index dd55aeb0..ad30c38b 100644
--- a/examples/hpc/dla-driver/src/tensor3.rs
+++ b/examples/hpc/dla-driver/src/tensor3.rs
@@ -1,6 +1,6 @@
 use alloc::vec::*;
 use core::ffi::c_char;
-use ndarray::{s, Array, Array3};
+use ndarray::{Array, Array3, ArrayView3, Axis, s, stack, concatenate};
 
 #[derive(Clone, Copy, Debug, PartialEq)]
 pub enum Order3 {
@@ -183,27 +183,11 @@ impl Tensor3 {
         self.order
     }
 
-    /// Concate vector of Tensor3 in order to single Tensor3
-    pub fn concat(tensors: Vec<Tensor3<T>>, axis: usize) -> Tensor3<T> {
-        let target_order = tensors[0].order();
-        let arrays: Vec<Array3<T>> = tensors.into_iter().map(|t| t.data).collect();
-        // Concatenate along the specified axis
-        let stacked = concatenate(Axis(axis), &arrays.iter().map(|a| a.view()).collect::<Vec<_>>())
-            .expect("Concatenation failed due to incompatible shapes");
-
-        Tensor3 {
-            data: stacked,
-            order: target_order,
-        }
-    }
-
     /// Concatenates a Tensor along the least significant axis (axis=2) by interleaving the tensors
     pub fn concat_interleaved(tensors: Vec<Tensor3<T>>) -> Tensor3<T> {
         let target_order = tensors[0].order();
-        let (height, width, channels) = (tensors[0].height(), tensors[0].width(), tensors[0].channels());
         let mut intermediary_buffer: Vec<T> =
             Vec::with_capacity(height * width * channels * tensors.len());
-
         for h in 0..height {
             for w in 0..width {
                 for c in 0..channels {
@@ -287,6 +271,7 @@ impl Tensor3 {
         data.permute(order);
         data.to_buffer()
     }
+
 }
 
 pub fn rescale(
@@ -316,6 +301,7 @@ pub fn rescale(
             let value = (input_scale / scale) * (*x as f32 * pre_scale - input_zero as f32)
                 + output_zero as f32;
             *x = value.clamp(i8::MIN as f32, i8::MAX as f32) as i8
+
         });
     }
 }
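For reference, the arithmetic inside `rescale` above, restated for a single element: the value is requantized from the input scale/zero point to the output scale/zero point and clamped to the i8 range. A minimal sketch using the parameter names from the Rust code:

```python
def rescale_one(x, input_scale, scale, pre_scale, input_zero, output_zero):
    # Same formula as the Rust closure above, for a single element
    value = (input_scale / scale) * (x * pre_scale - input_zero) + output_zero
    return int(max(-128.0, min(127.0, value)))  # clamp to i8, truncate like `as i8`

# e.g. moving x = 50 from (scale 0.1, zero point 3) to (scale 0.2, zero point -5):
assert rescale_one(50, 0.1, 0.2, 1.0, 3, -5) == 18  # 0.5 * 47 - 5 = 18.5 -> 18
```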
diff --git a/examples/hpc/dla-driver/src/utils.rs b/examples/hpc/dla-driver/src/utils.rs
index 160d4495..deeb646c 100644
--- a/examples/hpc/dla-driver/src/utils.rs
+++ b/examples/hpc/dla-driver/src/utils.rs
@@ -127,5 +127,4 @@ pub fn optimal_pp_bias_heuristic(bias: &Vec<i16>) -> u32 {
         return 8
     }
     pp
-
 }