diff --git a/programming_examples/ml/resnet/ptq_conv2x/Makefile b/programming_examples/ml/resnet/ptq_conv2x/Makefile new file mode 100755 index 0000000000..79e443d308 --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/Makefile @@ -0,0 +1,49 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +include ${srcdir}/../../../makefile-common + +mlirFileName = aie + +VPATH := ${srcdir}/../../../../aie_kernels/aie2 + +all: build/conv2dk1_i8.o build/conv2dk1_skip_init.o build/conv2dk3.o build/conv2dk1_skip.o build/conv2dk1_ui8.o build/final.xclbin + +build/${mlirFileName}.mlir: ${srcdir}/aie2.py + mkdir -p ${@D} + python3 $< > $@ +insts.txt: build/${mlirFileName}.mlir + aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $< + +build/conv2dk1_i8.o: conv2dk1.cc + xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@ + +build/conv2dk3.o: conv2dk3.cc + xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@ + +build/conv2dk1_skip_init.o: conv2dk1_skip_init.cc + xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@ + +build/conv2dk1_ui8.o: conv2dk1.cc + xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@ + +build/conv2dk1_skip.o: conv2dk1_skip.cc + xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@ + +build/final.xclbin: build/${mlirFileName}.mlir build/conv2dk1_i8.o build/conv2dk1_skip_init.o build/conv2dk3.o build/conv2dk1_skip.o build/conv2dk1_ui8.o + cd build && aiecc.py --basic-alloc-scheme --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts.txt ${ + +# ResNet with Offloaded Conv2_x Layers and Post-Training Quantization + +Quantization involves reducing the precision of the weights and activations of a neural network from floating-point (e.g., 32-bit float) to lower bit-width formats (e.g., 8-bit integer). Quantization reduces model size and speeds up inference, making a model more suitable for deployment on resource-constrained devices. In AI Engine (AIE), we use a power-of-two scale factor for the SRS (Shift-Round-Saturate) Unit to shift and scale the values to an integer range. A power of two is a number of the form 2^n, where n is an integer. Power-of-two scale factors can lead to more efficient hardware implementations, as multiplication by a power of two can be performed using bit shifts rather than more complex multiplication operations. + +[Brevitas](https://github.com/Xilinx/brevitas) is a PyTorch-based library designed for quantization of neural networks. It enables users to train models with reduced numerical precision, typically using lower bit widths for weights and activations, which can lead to significant improvements in computational efficiency and memory usage. Brevitas supports various quantization schemes, including uniform and non-uniform quantization, and can be used to target a wide range of hardware platforms, including FPGAs, ASICs, and AIEs. + +We use Brevitas to: +1. Quantize weights and activations of a model to lower bit format for AIE deployment, and +2. Extract proper power-of-two scale factors to set up the SRS unit. + +## Source Files Overview + +``` +. 
++-- ptq_conv2x # Implementation of ResNet conv2_x layers on NPU with PTQ. ++-- +-- data # Labels for CIFAR dataset. +| +-- aie2.py # A Python script that defines the AIE array structural design using MLIR-AIE operations. +| +-- Makefile # Contains instructions for building and compiling software projects. +| +-- model.py # Python code for ResNet Model where we apply PTQ. +| +-- README.md # This file. +| +-- requirements.txt # pip requirements to perform PTQ. +| +-- run_makefile.lit # For LLVM Integrated Tester (LIT) of the design. +| +-- test.py # Python code testbench for the design example. +| +-- utils.py # Python code for miscellaneous functions needed for inference. + + +``` + +# Post-Training Quantization Using Brevitas +To enhance the efficiency of our implementation, we perform post-training quantization on the model using the Brevitas library. This step converts the model to use 8-bit weights and activations and power-of-two scale factors, optimizing it for deployment on hardware with limited precision requirements. + + +## Step-by-Step Process +We use test.py to perform the following key tasks: + +**1. Loading the Pre-trained ResNet Model**: The script begins by loading a pre-trained ResNet model, which serves as the baseline for quantization and inference. + +**2. Applying Post-Training Quantization (PTQ)**: Using the Brevitas library, the script applies PTQ to the conv2_x layers of the ResNet model. This involves converting the weights and activations to 8-bit precision. + +**3. Extracting Power-of-Two Scale Factors**: After quantizing the weights and activations, the script extracts the power-of-two scale factors. These factors are crucial for efficient hardware implementation, as they simplify multiplication operations to bit shifts. + +**4. Calculating Combined Scales**: The combined scale factors are calculated by multiplying the extracted weight and activation scales for each layer. These combined scales are then used to set up the SRS unit. + +**5. Setting Up the SRS Unit**: +The SRS unit uses the calculated combined scales to efficiently shift and scale the values to the integer range required for the NPU. + +**6. Running Inference**: Finally, the script runs inference on the quantized model. The conv2_x layers are offloaded to the NPU, utilizing the SRS unit to scale the quantized weights and activations to the int8 range properly. + +# Compilation and Execution + +## Prerequisites +Ensure you have the necessary dependencies installed. You can install the required packages using: + +``` +pip install -r requirements.txt +``` +## Compilation +To compile the design: +``` +make +``` + +## Running the Design + +To run the design: +``` +make run_py +``` diff --git a/programming_examples/ml/resnet/ptq_conv2x/aie2.py b/programming_examples/ml/resnet/ptq_conv2x/aie2.py new file mode 100755 index 0000000000..334e1431bc --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/aie2.py @@ -0,0 +1,1011 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. 
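As a quick illustration of the power-of-two scaling and SRS behavior described in the README above, here is a minimal standalone sketch (not part of the PR's sources); the three scale values are made-up placeholders rather than scales extracted by `test.py`:

```python
# Minimal sketch: a power-of-two combined scale collapses into a single SRS shift.
# The scale values here are hypothetical placeholders.
import math
import numpy as np

in_scale, w_scale, out_scale = 2.0**-5, 2.0**-7, 2.0**-4   # po2 activation/weight/output scales
shift = int(-math.log2(in_scale * w_scale / out_scale))    # combined scale expressed as a shift (8)

acc = np.array([12345, -6789], dtype=np.int32)             # int32 accumulators from int8 MACs
# Shift-Round-Saturate: scale by 2**-shift, round, then clamp to the uint8 output range.
ofm = np.clip(np.round(acc * 2.0**-shift), 0, 255).astype(np.uint8)
print(shift, ofm)                                          # 8 [48  0]
```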
+ +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.dialects.ext import memref, arith +from aie.dialects.scf import for_, yield_ +from aie.extras.context import mlir_mod_ctx +from aie.ir import MemRefType, TypeAttr + +import sys + +# tracing definitions +trace_sz_in_bytes = 8192 +trace_sz_in_i32s = trace_sz_in_bytes // 4 +enableTrace = False + +# Define bottleneck layer sizes + + +def resnet_conv_x(): + + tensorInW = 32 + tensorInH = 32 + tensorInCInit = 64 + tensorInCRest = 4 * tensorInCInit + n_cols = 3 + repeat = 2 + + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.npu) + def deviceBody(): + + # define types + uint8_ty = IntegerType.get_unsigned(8) + int8_ty = IntegerType.get_signless(8) + int32_ty = IntegerType.get_signless(32) + + tensorLayer1In_ty_init = MemRefType.get( + ( + tensorInW, + 1, + tensorInCInit, + ), + int8_ty, + ) + tensorLayer1In_ty_rest = MemRefType.get( + ( + tensorInW, + 1, + tensorInCRest, + ), + uint8_ty, + ) + weightsLayer1_ty_init = MemRefType.get( + (tensorInCInit * tensorInCInit,), int8_ty + ) + weightsLayer1_ty_rest = MemRefType.get( + (tensorInCRest * tensorInCInit,), int8_ty + ) + + tensorLayer1Out_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorInCInit, + ), + uint8_ty, + ) + + tensorLayer2In_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorInCInit, + ), + uint8_ty, + ) + weightsLayer2_ty = MemRefType.get( + (3 * 3 * tensorInCInit * tensorInCInit,), int8_ty + ) + tensorLayer2Out_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorInCInit // 2, + ), + uint8_ty, + ) + + tensorLayer3In_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorInCInit // 2, + ), + uint8_ty, + ) + weightsLayer3_ty_init = MemRefType.get( + (2 * tensorInCInit * tensorInCRest,), int8_ty + ) + weightsLayer3_ty_rest = MemRefType.get( + (tensorInCRest // 4 * tensorInCRest,), int8_ty + ) + + tensorLayer3Out_ty = MemRefType.get( + ( + tensorInW, + 1, + tensorInCRest, + ), + uint8_ty, + ) + + allWeights_ty_init = MemRefType.get( + ( + tensorInCInit * tensorInCInit + + 3 * 3 * tensorInCInit * tensorInCInit + + tensorInCInit * tensorInCRest + + tensorInCInit * tensorInCRest, + ), + int8_ty, + ) + + allWeights_ty_rest = MemRefType.get( + ( + tensorInCRest * tensorInCInit + + 3 * 3 * tensorInCInit * tensorInCInit + + tensorInCInit * tensorInCRest, + ), + int8_ty, + ) + + # kernel definitions + conv2dk1_i8 = external_func( + "conv2dk1_i8", + inputs=[ + tensorLayer1In_ty_init, + weightsLayer1_ty_init, + tensorLayer1Out_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + conv2dk3 = external_func( + "conv2dk3_ui8", + inputs=[ + tensorLayer2In_ty, + tensorLayer2In_ty, + tensorLayer2In_ty, + weightsLayer2_ty, + tensorLayer2Out_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + conv2dk1_skip_init_i8 = external_func( + "conv2dk1_skip_init_i8", + inputs=[ + tensorLayer3In_ty, + tensorLayer3In_ty, + weightsLayer3_ty_init, + tensorLayer3Out_ty, + tensorLayer1In_ty_init, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + conv2dk1_ui8 = external_func( + "conv2dk1_ui8", + inputs=[ + tensorLayer3Out_ty, + weightsLayer1_ty_rest, + tensorLayer1Out_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + + conv2dk1_skip_ui8 = external_func( + "conv2dk1_skip_ui8", + inputs=[ + tensorLayer3In_ty, + tensorLayer3In_ty, + weightsLayer3_ty_rest, + tensorLayer3Out_ty, + tensorLayer3Out_ty, + int32_ty, + int32_ty, + int32_ty, + 
int32_ty, + int32_ty, + ], + ) + + ShimTile00 = tile(0, 0) + MemTile01 = tile(0, 1) + ComputeTile02 = tile(0, 2) + ComputeTile03 = tile(0, 3) + ComputeTile04 = tile(0, 4) + ComputeTile05 = tile(0, 5) + + ShimTile10 = tile(1, 0) + MemTile11 = tile(1, 1) + ComputeTile12 = tile(1, 2) + ComputeTile13 = tile(1, 3) + ComputeTile14 = tile(1, 4) + ComputeTile15 = tile(1, 5) + + ShimTile20 = tile(2, 0) + MemTile21 = tile(2, 1) + ComputeTile22 = tile(2, 2) + ComputeTile23 = tile(2, 3) + ComputeTile24 = tile(2, 4) + ComputeTile25 = tile(2, 5) + + shims = [ShimTile00, ShimTile10, ShimTile20] + mems = [MemTile01, MemTile11, MemTile21] + wts_sizes = [allWeights_ty_init, allWeights_ty_rest, allWeights_ty_rest] + layer1_wts_sizes = [ + weightsLayer1_ty_init, + weightsLayer1_ty_rest, + weightsLayer1_ty_rest, + ] + laye1_act_sizes = [ + tensorLayer1In_ty_init, + tensorLayer1In_ty_rest, + tensorLayer1In_ty_rest, + ] + layer3_wts_sizes = [ + weightsLayer3_ty_init, + weightsLayer3_ty_rest, + weightsLayer3_ty_rest, + ] + + cores = [ + [ComputeTile02, ComputeTile03, ComputeTile04, ComputeTile05], + [ComputeTile15, ComputeTile14, ComputeTile13, ComputeTile12], + [ComputeTile22, ComputeTile23, ComputeTile24, ComputeTile25], + ] + + if enableTrace: + flow(ComputeTile04, WireBundle.Trace, 0, ShimTile00, WireBundle.DMA, 1) + + # runtime parameters + + rtpComputeTile02 = Buffer(ComputeTile02, [16], T.i32(), "rtpComputeTile02") + rtpComputeTile03 = Buffer(ComputeTile03, [16], T.i32(), "rtpComputeTile03") + rtpComputeTile04 = Buffer(ComputeTile05, [16], T.i32(), "rtpComputeTile04") + rtpComputeTile05 = Buffer(ComputeTile04, [16], T.i32(), "rtpComputeTile05") + + rtpComputeTile12 = Buffer(ComputeTile12, [16], T.i32(), "rtpComputeTile12") + rtpComputeTile13 = Buffer(ComputeTile13, [16], T.i32(), "rtpComputeTile13") + rtpComputeTile14 = Buffer(ComputeTile14, [16], T.i32(), "rtpComputeTile14") + rtpComputeTile15 = Buffer(ComputeTile15, [16], T.i32(), "rtpComputeTile15") + + rtpComputeTile22 = Buffer(ComputeTile22, [16], T.i32(), "rtpComputeTile22") + rtpComputeTile23 = Buffer(ComputeTile23, [16], T.i32(), "rtpComputeTile23") + rtpComputeTile24 = Buffer(ComputeTile24, [16], T.i32(), "rtpComputeTile24") + rtpComputeTile25 = Buffer(ComputeTile25, [16], T.i32(), "rtpComputeTile25") + + rtp = [ + [ + rtpComputeTile02, + rtpComputeTile03, + rtpComputeTile04, + rtpComputeTile05, + ], + [ + rtpComputeTile15, + rtpComputeTile14, + rtpComputeTile13, + rtpComputeTile12, + ], + [ + rtpComputeTile22, + rtpComputeTile23, + rtpComputeTile24, + rtpComputeTile25, + ], + ] + rtp_name = [ + [ + "rtpComputeTile02", + "rtpComputeTile03", + "rtpComputeTile04", + "rtpComputeTile05", + ], + [ + "rtpComputeTile12", + "rtpComputeTile13", + "rtpComputeTile14", + "rtpComputeTile15", + ], + [ + "rtpComputeTile22", + "rtpComputeTile23", + "rtpComputeTile24", + "rtpComputeTile25", + ], + ] + # set up data movement with OFs + conv1_kernels = ["conv2dk1_i8.o", "conv2dk1_ui8.o", "conv2dk1_ui8.o"] + conv1_kernels_call = [conv2dk1_i8, conv2dk1_ui8, conv2dk1_ui8] + + conv3_kernels = [ + "conv2dk1_skip_init.o", + "conv2dk1_skip.o", + "conv2dk1_skip.o", + ] + conv3_kernels_call = [ + conv2dk1_skip_init_i8, + conv2dk1_skip_ui8, + conv2dk1_skip_ui8, + ] + + act1_fifo_names = ["act1_00_02_01", "act1_04_15_01", "act1_13_22_21"] + act1_fifos = {} + + # input tensor (with broadcast for skip connection) + act1_fifo_names = ["act1_00_02_01", "act1_04_15_11", "act1_13_22_21"] + act1_fifos = {} + + skip_fifo_names = ["skip_0", "skip_1", "skip_2"] + skip_fifos = {} + + 
act1_fifos[act1_fifo_names[0]] = object_fifo( + act1_fifo_names[0], + shims[0], + [cores[0][0], mems[0]], + [2, 2, 4], + laye1_act_sizes[0], + ) + skip_fifos[skip_fifo_names[0]] = object_fifo( + skip_fifo_names[0], mems[0], cores[0][2], 2, laye1_act_sizes[0] + ) + object_fifo_link(act1_fifo_names[0], skip_fifo_names[0]) + + for i in range(1, repeat + 1): + if i == 1: + act1_fifos[act1_fifo_names[i]] = object_fifo( + act1_fifo_names[i], + cores[i - 1][2], + [cores[i][0], mems[i - 1]], + [2, 2, 4], + laye1_act_sizes[i], + ) + skip_fifos[skip_fifo_names[i]] = object_fifo( + skip_fifo_names[i], + mems[i - 1], + cores[i][2], + 2, + laye1_act_sizes[i], + ) + object_fifo_link(act1_fifo_names[i], skip_fifo_names[i]) + else: + act1_fifos[act1_fifo_names[i]] = object_fifo( + act1_fifo_names[i], + cores[i - 1][2], + [cores[i][0], mems[i]], + [2, 2, 4], + laye1_act_sizes[i], + ) + skip_fifos[skip_fifo_names[i]] = object_fifo( + skip_fifo_names[i], + mems[i], + cores[i][2], + 2, + laye1_act_sizes[i], + ) + object_fifo_link(act1_fifo_names[i], skip_fifo_names[i]) + + act2_fifo_names = ["act2_02_03_05", "act2_15_12_14", "act2_22_23_25"] + act2_fifos = {} + + act3_fifo_names_1 = ["act3_03_04", "act3_14_13", "act3_23_24"] + act3_fifo_1 = {} + + act3_fifo_names_2 = ["act3_05_04", "act3_12_13", "act3_25_24"] + act3_fifo_2 = {} + + for i in range(n_cols): + if i == 1: + # 1x1 -> 3x3 + act2_fifos[act2_fifo_names[i]] = object_fifo( + act2_fifo_names[i], + cores[i][0], + [cores[i][3], cores[i][1]], + 4, + tensorLayer1Out_ty, + ) + + # 3x3 -> 1x1 + act3_fifo_1[act3_fifo_names_1[i]] = object_fifo( + act3_fifo_names_1[i], + cores[i][1], + cores[i][2], + 2, + tensorLayer2Out_ty, + ) + # 3x3 -> 1x1 + act3_fifo_2[act3_fifo_names_2[i]] = object_fifo( + act3_fifo_names_2[i], + cores[i][3], + cores[i][2], + 2, + tensorLayer2Out_ty, + ) + else: + # 1x1 -> 3x3 + act2_fifos[act2_fifo_names[i]] = object_fifo( + act2_fifo_names[i], + cores[i][0], + [cores[i][1], cores[i][3]], + 4, + tensorLayer1Out_ty, + ) + + # 3x3 -> 1x1 + act3_fifo_1[act3_fifo_names_1[i]] = object_fifo( + act3_fifo_names_1[i], + cores[i][1], + cores[i][2], + 2, + tensorLayer2Out_ty, + ) + # 3x3 -> 1x1 + act3_fifo_2[act3_fifo_names_2[i]] = object_fifo( + act3_fifo_names_2[i], + cores[i][3], + cores[i][2], + 2, + tensorLayer2Out_ty, + ) + wts_fifo_names = ["wts_0_L3L2", "wts_1_L3L2", "wts_2_L3L2"] + wts_fifos = {} + wts_sub_fifo_names = [ + ["wts_buf_00", "wts_buf_01", "wts_buf_02"], + ["wts_buf_10", "wts_buf_11", "wts_buf_12"], + ["wts_buf_20", "wts_buf_21", "wts_buf_22"], + ] + wts_sub_fifos = {} + + for i in range(n_cols): + + wts_fifos[wts_fifo_names[i]] = object_fifo( + wts_fifo_names[i], shims[i], mems[i], 1, wts_sizes[i] + ) + wts_sub_fifos[wts_sub_fifo_names[i][0]] = object_fifo( + wts_sub_fifo_names[i][0], + mems[i], + cores[i][0], + 1, + layer1_wts_sizes[i], + ) + if i == 1: + wts_sub_fifos[wts_sub_fifo_names[i][1]] = object_fifo( + wts_sub_fifo_names[i][1], + mems[i], + [cores[i][3], cores[i][1]], + 1, + weightsLayer2_ty, + ) + + else: + wts_sub_fifos[wts_sub_fifo_names[i][1]] = object_fifo( + wts_sub_fifo_names[i][1], + mems[i], + [cores[i][1], cores[i][3]], + 1, + weightsLayer2_ty, + ) + wts_sub_fifos[wts_sub_fifo_names[i][2]] = object_fifo( + wts_sub_fifo_names[i][2], + mems[i], + cores[i][2], + 1, + layer3_wts_sizes[i], + ) + object_fifo_link( + wts_fifo_names[i], + [ + wts_sub_fifo_names[i][0], + wts_sub_fifo_names[i][1], + wts_sub_fifo_names[i][2], + ], + ) + # output tensor + outOFL2L3 = object_fifo( + "outOFL2L3", cores[2][2], shims[1], 2, 
tensorLayer3Out_ty + ) + conv3_out_fifo = [ + act1_fifos[act1_fifo_names[1]], + act1_fifos[act1_fifo_names[2]], + outOFL2L3, + ] + conv3_out_fifo_names = ["act1_04_15_11", "act1_13_22_21", "outOFL2L3"] + # # 1x1 conv2d + for i in range(n_cols): + + @core(cores[i][0], conv1_kernels[i]) + def core_body(): + for _ in for_(sys.maxsize): + + # acquire weights once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][0] + ].acquire(ObjectFifoPort.Consume, 1) + scale = memref.load(rtp[i][0], [0]) + for _ in for_(tensorInH): + element0ActivactionsIn = act1_fifos[ + act1_fifo_names[i] + ].acquire(ObjectFifoPort.Consume, 1) + element0ActivactionsOut = act2_fifos[ + act2_fifo_names[i] + ].acquire(ObjectFifoPort.Produce, 1) + if i == 0: + res = call( + conv1_kernels_call[i], + [ + element0ActivactionsIn, + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit, + scale, + ], + ) + else: + res = call( + conv1_kernels_call[i], + [ + element0ActivactionsIn, + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCRest, + tensorInCInit, + scale, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act1_fifo_names[i], 1 + ) + + objectfifo_release( + ObjectFifoPort.Produce, act2_fifo_names[i], 1 + ) + yield_([]) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][0], 1 + ) + yield_([]) + + # 3x3 conv2d OFM 0-31 + for i in range(n_cols): + + @core(cores[i][1], "conv2dk3.o") + def core_body(): + if i == 2: + scale = 9 + else: + scale = 9 + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][1] + ].acquire(ObjectFifoPort.Consume, 1) + # scale = memref.load(rtpComputeTile03, 0) + + # pre-amble: top row + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[0], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit // 2, + 3, + 3, + 0, + scale, + 0, + ], + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_1[i], 1 + ) + + # middle + for _ in for_(tensorInH - 2): + elementActivactionsIn = act2_fifos[ + act2_fifo_names[i] + ].acquire(ObjectFifoPort.Consume, 3) + element0ActivactionsOut = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[2], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit // 2, + 3, + 3, + 1, + scale, + 0, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_1[i], 1 + ) + yield_([]) + + # last part + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit // 2, + 3, + 3, + 2, + scale, + 0, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 2 + ) + objectfifo_release( + ObjectFifoPort.Produce, 
act3_fifo_names_1[i], 1 + ) + + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][1], 1 + ) + yield_([]) + + # 3x3 conv2d OFM 32-63 + + for i in range(n_cols): + + @core(cores[i][3], "conv2dk3.o") + def core_body(): + if i == 2: + scale = 9 + else: + scale = 9 + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][1] + ].acquire(ObjectFifoPort.Consume, 1) + # scale = memref.load(rtpComputeTile05, 0) + + # pre-amble: top row + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[0], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit // 2, + 3, + 3, + 0, + scale, + tensorInCInit // 2, + ], + ) + + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 + ) + + # middle + for _ in for_(tensorInH - 2): + elementActivactionsIn = act2_fifos[ + act2_fifo_names[i] + ].acquire(ObjectFifoPort.Consume, 3) + element0ActivactionsOut = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[2], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit // 2, + 3, + 3, + 1, + scale, + tensorInCInit // 2, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 + ) + yield_([]) + + # last part + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit // 2, + 3, + 3, + 2, + scale, + tensorInCInit // 2, + ], + ) + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 2 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][1], 1 + ) + yield_([]) + + # # 1x1 conv2d and add skip + for i in range(n_cols): + + @core(cores[i][2], conv3_kernels[i]) + def core_body(): + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][2] + ].acquire(ObjectFifoPort.Consume, 1) + if i == 0: + scale = memref.load(rtp[0][3], [0]) + skipScale = memref.load(rtp[0][3], [1]) + skipConvScale = memref.load(rtp[0][3], [2]) + else: + scale = memref.load(rtp[i][2], [0]) + skipScale = memref.load(rtp[i][2], [1]) + + for _ in for_(tensorInH): + element0ActivactionsIn = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Consume, 1) + element1ActivactionsIn = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Consume, 1) + + elementActivactionsOut = conv3_out_fifo[i].acquire( + ObjectFifoPort.Produce, 1 + ) + elementSkipsIn = skip_fifos[skip_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 1 + ) + if i == 0: + call( + conv3_kernels_call[0], + [ + element0ActivactionsIn, + element1ActivactionsIn, + element0Weights, + elementActivactionsOut, + 
elementSkipsIn, + tensorInW, + tensorInCInit, + tensorInCRest, + tensorInCInit, + scale, + skipScale, + skipConvScale, + ], + ) + else: + call( + conv3_kernels_call[i], + [ + element0ActivactionsIn, + element1ActivactionsIn, + element0Weights, + elementActivactionsOut, + elementSkipsIn, + tensorInW, + tensorInCInit, + tensorInCRest, + scale, + skipScale, + ], + ) + objectfifo_release( + ObjectFifoPort.Consume, act3_fifo_names_1[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Consume, act3_fifo_names_2[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Produce, conv3_out_fifo_names[i], 1 + ) + + objectfifo_release( + ObjectFifoPort.Consume, skip_fifo_names[i], 1 + ) + yield_([]) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][2], 1 + ) + yield_([]) + + # instruction stream generation + activationsInSize32b = (tensorInW * tensorInH * tensorInCInit) // 4 + acitivationsOutSize32b = (tensorInW * tensorInH * tensorInCRest) // 4 + + totalWeightsSize32b_init = ( + tensorInCInit * tensorInCInit + + 3 * 3 * tensorInCInit * tensorInCInit + + 2 * tensorInCInit * tensorInCRest + ) // 4 + + totalWeightsSize32b_rest = ( + tensorInCInit * tensorInCRest + + 3 * 3 * tensorInCInit * tensorInCInit + + tensorInCInit * tensorInCRest + ) // 4 + + totalWeightsSize32b_complete = ( + totalWeightsSize32b_init + repeat * totalWeightsSize32b_rest + ) + + activationsInL3_ty = MemRefType.get((activationsInSize32b,), int32_ty) + activationsOutL3_ty = MemRefType.get((acitivationsOutSize32b,), int32_ty) + weightsInL3_ty_init = MemRefType.get((totalWeightsSize32b_init,), int32_ty) + weightsInL3_ty_rest = MemRefType.get((totalWeightsSize32b_rest,), int32_ty) + + weightsInL3_ty_complete = MemRefType.get( + (totalWeightsSize32b_complete,), int32_ty + ) + + @FuncOp.from_py_func( + activationsInL3_ty, weightsInL3_ty_complete, activationsOutL3_ty + ) + def sequence(inputFromL3, weightsFromL3, outputToL3): + + # for c, col in enumerate(rtp_name): + # for r, row in enumerate(col): + # NpuWriteRTPOp(row, col=c, row=r + 2, index=0, value=1) # scale + NpuWriteRTPOp("rtpComputeTile02", col=0, row=2, index=0, value=8) + NpuWriteRTPOp("rtpComputeTile03", col=0, row=3, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile04", col=0, row=5, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=0, value=11) + NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=1, value=0) + NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=2, value=7) + + NpuWriteRTPOp("rtpComputeTile15", col=1, row=5, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile14", col=1, row=4, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile12", col=1, row=2, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=0, value=12) + NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=1, value=0) + + NpuWriteRTPOp("rtpComputeTile22", col=2, row=2, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile23", col=2, row=3, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile25", col=2, row=5, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=0, value=12) + NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=1, value=0) + + rtp_1 = [7, 10, 13, -2, 10] + rtp_2 = [8, 10, 12] + rtp_3 = [9, 9, 12] + + npu_dma_memcpy_nd( + metadata="act1_00_02_01", + bd_id=0, + mem=inputFromL3, + sizes=[1, 1, 1, activationsInSize32b], + ) + npu_dma_memcpy_nd( + metadata="outOFL2L3", + bd_id=2, + mem=outputToL3, + sizes=[1, 1, 1, acitivationsOutSize32b], + ) + npu_dma_memcpy_nd( + metadata="wts_0_L3L2", + bd_id=1, + 
mem=weightsFromL3, + sizes=[1, 1, 1, totalWeightsSize32b_init], + ) + + npu_dma_memcpy_nd( + metadata="wts_1_L3L2", + bd_id=1, + mem=weightsFromL3, + offsets=[0, 0, 0, totalWeightsSize32b_init], + sizes=[1, 1, 1, totalWeightsSize32b_rest], + ) + + npu_dma_memcpy_nd( + metadata="wts_2_L3L2", + bd_id=1, + mem=weightsFromL3, + offsets=[ + 0, + 0, + 0, + totalWeightsSize32b_init + totalWeightsSize32b_rest, + ], + sizes=[1, 1, 1, totalWeightsSize32b_rest], + ) + + npu_sync(column=1, row=0, direction=0, channel=0) + + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) + + +resnet_conv_x() diff --git a/programming_examples/ml/resnet/ptq_conv2x/data/cifar10_label_map.txt b/programming_examples/ml/resnet/ptq_conv2x/data/cifar10_label_map.txt new file mode 100644 index 0000000000..1fc508024c --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/data/cifar10_label_map.txt @@ -0,0 +1 @@ +{"0": "airplane", "1": "automobile", "2": "bird", "3": "cat", "4": "deer", "5": "dog", "6": "frog", "7": "horse", "8": "ship", "9": "truck"} \ No newline at end of file diff --git a/programming_examples/ml/resnet/ptq_conv2x/model.py b/programming_examples/ml/resnet/ptq_conv2x/model.py new file mode 100644 index 0000000000..a980782021 --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/model.py @@ -0,0 +1,158 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class CombinedModel(nn.Module): + def __init__(self, first, aie, post): + super(CombinedModel, self).__init__() + self.first = first + self.aie = aie + self.post = post + + def forward(self, x): + x = self.first(x) + x = self.aie(x) + x = self.post(x) + return x + + +class PreAIELayers(nn.Module): + def __init__(self): + super(PreAIELayers, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + + def forward(self, x): + out = self.conv1(x) + out = self.bn1(out) + # print( out) + out = F.relu(out) + return out + + +class AIEConv2xOffload(nn.Module): + def __init__(self, block, num_blocks): + super(AIEConv2xOffload, self).__init__() + self.in_planes = 64 + self.layer1 = block(in_planes=64, planes=64) + self.layer2 = block(in_planes=256, planes=64) + self.layer3 = block(in_planes=256, planes=64) + + def forward(self, x): + out = self.layer1(x) + out = self.layer2(out) + out = self.layer3(out) + return out + + +class PostAIELayers(nn.Module): + def __init__(self, block, num_blocks, num_classes): + super(PostAIELayers, self).__init__() + + self.in_planes = 256 + self.layer2 = self._make_layer(block, 128, num_blocks[0], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[1], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[2], stride=2) + self.linear = nn.Linear(512 * block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1] * (num_blocks - 1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = self.layer2(x) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 32) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +class Bottleneck_projected(nn.Module): + expansion = 4 + + def __init__(self, in_planes, planes, stride=1, option="A"): + super(Bottleneck_projected, self).__init__() + self.conv1 = nn.Conv2d(in_planes, 
planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + self.bn3 = nn.BatchNorm2d(self.expansion * planes) + self.relu1 = nn.ReLU() + self.relu2 = nn.ReLU() + self.relu3 = nn.ReLU() + + self.shortcut = nn.Sequential() + if in_planes != self.expansion * planes: + self.shortcut = nn.Sequential( + nn.Conv2d( + in_planes, self.expansion * planes, kernel_size=1, bias=False + ), + nn.BatchNorm2d(self.expansion * planes), + ) + + def forward(self, x): + out = self.relu1(self.bn1(self.conv1(x))) + out = self.relu2(self.bn2(self.conv2(out))) + out = self.bn3(self.conv3(out)) + out = out + self.shortcut(x) + out = self.relu3(out) + return out + + +class Bottleneck_fused_projected(nn.Module): + expansion = 4 + + def __init__(self, in_planes, planes, stride=1, option="A"): + super(Bottleneck_fused_projected, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + + self.conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.relu1 = nn.ReLU() + self.relu2 = nn.ReLU() + self.relu3 = nn.ReLU() + + self.shortcut = nn.Sequential() + if in_planes != self.expansion * planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, bias=False) + ) + + def forward(self, x): + out = self.relu1((self.conv1(x))) + out = self.relu2((self.conv2(out))) + out = self.conv3(out) + out += self.shortcut(x) + out = self.relu3(out) + return out + + +def Resnet50_conv2x_offload(num_classes): + return CombinedModel( + PreAIELayers(), + AIEConv2xOffload( + Bottleneck_fused_projected, + [ + 1, + ], + ), + PostAIELayers(Bottleneck_projected, [4, 6, 3], num_classes), + ) diff --git a/programming_examples/ml/resnet/ptq_conv2x/requirements.txt b/programming_examples/ml/resnet/ptq_conv2x/requirements.txt new file mode 100644 index 0000000000..47a9883564 --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/requirements.txt @@ -0,0 +1,4 @@ +brevitas +torchvision +tqdm +opencv-python \ No newline at end of file diff --git a/programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit b/programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit new file mode 100644 index 0000000000..4d506d9c43 --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess, torch, dontrun +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile +// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s +// CHECK: PASS! diff --git a/programming_examples/ml/resnet/ptq_conv2x/test.py b/programming_examples/ml/resnet/ptq_conv2x/test.py new file mode 100755 index 0000000000..175cfb8fab --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/test.py @@ -0,0 +1,487 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. 
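Before the test script proper, a hypothetical smoke test (not included in the PR) of the first/aie/post split defined in `model.py` above; the printed shapes are the logical input and output tensors of the stage that `test.py` offloads to the NPU:

```python
# Hypothetical smoke test for the first/aie/post split defined in model.py.
import torch
import model as res

m = res.Resnet50_conv2x_offload(num_classes=10).eval()
x = torch.randn(1, 3, 32, 32)              # one CIFAR-10 sized image
with torch.no_grad():
    pre = m.first(x)                       # host: conv1 + bn + relu
    aie = m.aie(pre)                       # conv2_x stage, offloaded to the NPU in test.py
    out = m.post(aie)                      # host: remaining layers + classifier
print(pre.shape, aie.shape, out.shape)
# torch.Size([1, 64, 32, 32]) torch.Size([1, 256, 32, 32]) torch.Size([1, 10])
```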
+ +import torch +import torch.nn as nn +import sys +import math +from aie.utils.ml import DataShaper +import time +import os +import numpy as np +import model as res + +from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute +import aie.utils.test as test_utils + +torch.use_deterministic_algorithms(True) +torch.manual_seed(0) +from utils import unpickle, load_class_label +import torchvision +from torchvision import transforms +from PIL import Image +from brevitas.nn import QuantConv2d, QuantIdentity, QuantReLU +from brevitas.quant.fixed_point import ( + Int8ActPerTensorFixedPoint, + Int8WeightPerTensorFixedPoint, + Uint8ActPerTensorFixedPoint, +) +from brevitas.graph.target.flexml import preprocess_for_flexml_quantize +from brevitas_examples.imagenet_classification.ptq.ptq_common import quantize_model +import torch.utils.data as data_utils +from brevitas_examples.imagenet_classification.ptq.ptq_common import calibrate +from brevitas_examples.imagenet_classification.ptq.ptq_common import calibrate_bn +from brevitas_examples.imagenet_classification.utils import generate_dataloader +from brevitas_examples.imagenet_classification.utils import SEED +from brevitas_examples.imagenet_classification.utils import validate + + +def main(opts): + design = "resnet_conv2_x_int8" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # ------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("uint8") + + shape_in_act = (32, 8, 32, 8) + shape_total_wts = (212992, 1) + shape_out = (32, 32, 32, 8) + + # ------------------------------------------------------ + # Post training quantization to get int8 weights and activation for AIE + # ------------------------------------------------------ + # Step 1: Load the pre-trained ResNet model + num_classes = 10 + model = res.Resnet50_conv2x_offload(num_classes) + weights = "trained_resnet50/weight.tar" # trained FP model + saved_model_dict = torch.load(weights, map_location=torch.device("cpu")) + model.load_state_dict(saved_model_dict) + + data_dir = "data" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + transform = transforms.Compose( + [ + transforms.Pad(4), + transforms.RandomHorizontalFlip(), + transforms.RandomCrop(32), + transforms.ToTensor(), + ] + ) + transform_train = transforms.Compose( + [ + transforms.RandomHorizontalFlip(), + transforms.RandomCrop(32, padding=4), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] + ) + transform_test = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] + ) + + # CIFAR-10 dataset + train_dataset = torchvision.datasets.CIFAR10( + root=data_dir, train=True, transform=transform_train, download=True + ) + test_dataset = torchvision.datasets.CIFAR10( + root=data_dir, train=False, transform=transform_test, download=True + ) + + # Data loader for calibration + indices = torch.arange(256) + tr_sub = data_utils.Subset(train_dataset, indices) + val_sub = data_utils.Subset(test_dataset, indices) + calib_loader = torch.utils.data.DataLoader( + 
dataset=tr_sub, batch_size=64, shuffle=True + ) + val_loader = torch.utils.data.DataLoader( + dataset=val_sub, batch_size=64, shuffle=False + ) + + # Step 2: Apply quantization to the conv2_x layers to convert weights to 8-bit precision + img_shape = 32 + model_aie = preprocess_for_flexml_quantize( + model.aie, + torch.ones(1, 64, img_shape, img_shape), + equalize_iters=1000, + equalize_merge_bias=True, + merge_bn=True, + ) + + quant_model = quantize_model( + model_aie, + backend="flexml", + scale_factor_type="po2_scale", # Ensuring scale factors are powers of two + bias_bit_width=32, + weight_bit_width=8, + weight_narrow_range=False, + weight_param_method="stats", + weight_quant_granularity="per_tensor", + weight_quant_type="sym", + layerwise_first_last_bit_width=8, + act_bit_width=8, + act_param_method="stats", + act_quant_percentile=99.999, + act_quant_type="sym", + quant_format="int", + layerwise_first_last_mantissa_bit_width=4, + layerwise_first_last_exponent_bit_width=3, + weight_mantissa_bit_width=4, + weight_exponent_bit_width=3, + act_mantissa_bit_width=4, + act_exponent_bit_width=3, + ) + + model.aie = quant_model + model.eval() + print("Starting post training quantization:") + calibrate(calib_loader, model) + model.eval() + device, dtype = ( + next(model.parameters()).device, + next(model.parameters()).dtype, + ) + # ----------------------- + + from numpy import load + + # Extracting quantized weights and scale factors + params = {} + weights = {} + for name, module in model.named_modules(): + if isinstance(module, QuantConv2d): + weights[name + ".int_weight"] = module.quant_weight().int( + float_datatype=False + ) + params[name + "_scale"] = module.quant_weight().scale.detach().numpy() + if isinstance(module, QuantIdentity): + params[name + "_scale"] = module.quant_act_scale() + if isinstance(module, QuantReLU): + params[name + "_scale"] = module.quant_act_scale() + np.savez(os.path.join(os.getcwd(), "int_weights.npz"), **weights) + np.savez(os.path.join(os.getcwd(), "int_conv_scale.npz"), **params) + int_wts_data = load("int_weights.npz", allow_pickle=True) + int_scale_data = load("int_conv_scale.npz", allow_pickle=True) + + # Loading weights and scales + int_wts_data_lst = int_wts_data.files + block_0_int_weight_1 = torch.from_numpy(int_wts_data["aie.layer1.conv1.int_weight"]) + block_0_int_weight_2 = torch.from_numpy(int_wts_data["aie.layer1.conv2.int_weight"]) + block_0_int_weight_3 = torch.from_numpy(int_wts_data["aie.layer1.conv3.int_weight"]) + block_0_int_weight_skip = torch.from_numpy( + int_wts_data["aie.layer1.shortcut.0.int_weight"] + ) + + block_1_int_weight_1 = torch.from_numpy(int_wts_data["aie.layer2.conv1.int_weight"]) + block_1_int_weight_2 = torch.from_numpy(int_wts_data["aie.layer2.conv2.int_weight"]) + block_1_int_weight_3 = torch.from_numpy(int_wts_data["aie.layer2.conv3.int_weight"]) + + block_2_int_weight_1 = torch.from_numpy(int_wts_data["aie.layer3.conv1.int_weight"]) + block_2_int_weight_2 = torch.from_numpy(int_wts_data["aie.layer3.conv2.int_weight"]) + block_2_int_weight_3 = torch.from_numpy(int_wts_data["aie.layer3.conv3.int_weight"]) + + int_scale_data_lst = int_scale_data.files + + init_scale = int_scale_data["aie.x_quant_scale"] + block_0_relu_1 = int_scale_data["aie.layer1.relu1_scale"] + block_0_relu_2 = int_scale_data["aie.layer1.relu2_scale"] + block_0_relu_3 = int_scale_data["aie.layer1.relu3_scale"] + block_0_add_scale = int_scale_data["aie.add_quant_scale"] + + block_0_weight_scale_1 = int_scale_data["aie.layer1.conv1_scale"] + 
block_0_weight_scale_2 = int_scale_data["aie.layer1.conv2_scale"] + block_0_weight_scale_3 = int_scale_data["aie.layer1.conv3_scale"] + block_0_weight_scale_skip = int_scale_data["aie.layer1.shortcut.0_scale"] + + block_1_relu_1 = int_scale_data["aie.layer2.relu1_scale"] + block_1_relu_2 = int_scale_data["aie.layer2.relu2_scale"] + block_1_relu_3 = int_scale_data["aie.layer2.relu3_scale"] + block_1_add_scale = int_scale_data["aie.add_1_quant_scale"] + + block_1_weight_scale_1 = int_scale_data["aie.layer2.conv1_scale"] + block_1_weight_scale_2 = int_scale_data["aie.layer2.conv2_scale"] + block_1_weight_scale_3 = int_scale_data["aie.layer2.conv3_scale"] + + block_2_relu_1 = int_scale_data["aie.layer3.relu1_scale"] + block_2_relu_2 = int_scale_data["aie.layer3.relu2_scale"] + block_2_relu_3 = int_scale_data["aie.layer3.relu3_scale"] + block_2_add_scale = int_scale_data["aie.add_2_quant_scale"] + + block_2_weight_scale_1 = int_scale_data["aie.layer3.conv1_scale"] + block_2_weight_scale_2 = int_scale_data["aie.layer3.conv2_scale"] + block_2_weight_scale_3 = int_scale_data["aie.layer3.conv3_scale"] + + for name, param in model.named_parameters(): + if name.endswith(".bias"): + param.data.fill_(0) + + # Calculate combined scales + block_0_combined_scale1 = -math.log( + init_scale * block_0_weight_scale_1 / block_0_relu_1, 2 + ) # after conv1x1 + block_0_combined_scale2 = -math.log( + block_0_relu_1 * block_0_weight_scale_2 / block_0_relu_2, 2 + ) # after conv3x3 + block_0_combined_scale3 = -math.log( + block_0_relu_2 * block_0_weight_scale_3 / block_0_add_scale, 2 + ) # after conv1x1 + block_0_combined_scale4 = -math.log( + block_0_add_scale / block_0_relu_3, 2 + ) # after skip addition using init scale + # combined_scale4=-math.log(inp_scale1/inp_scale4) + block_0_combined_scale_skip = -math.log( + init_scale * block_0_weight_scale_skip / block_0_add_scale, 2 + ) # after LHS conv1x1 + + block_1_combined_scale1 = -math.log( + block_0_relu_3 * block_1_weight_scale_1 / block_1_relu_1, 2 + ) # after conv1x1 + block_1_combined_scale2 = -math.log( + block_1_relu_1 * block_1_weight_scale_2 / block_1_relu_2, 2 + ) # after conv3x3 + block_1_combined_scale3 = -math.log( + block_1_relu_2 * block_1_weight_scale_3 / block_1_add_scale, 2 + ) # after conv1x1 + block_1_combined_scale4 = -math.log( + block_1_add_scale / block_1_relu_3, 2 + ) # after skip addition using init scale + + block_2_combined_scale1 = -math.log( + block_1_relu_3 * block_2_weight_scale_1 / block_2_relu_1, 2 + ) # RHS after first conv1x1 | clip 0-->255 + block_2_combined_scale2 = -math.log( + block_2_relu_1 * block_2_weight_scale_2 / block_2_relu_2, 2 + ) # RHS after second conv3x3 | clip 0-->255 + block_2_combined_scale3 = -math.log( + block_2_relu_2 * block_2_weight_scale_3 / block_2_add_scale, 2 + ) # RHS after third conv1x1 | clip -128-->+127 + block_2_combined_scale4 = -math.log( + block_2_add_scale / block_2_relu_3, 2 + ) # After addition | clip 0-->255 + + print("--------------------------------------------------------------") + print("Block0 combined_scale after first conv1x1:", block_0_combined_scale1) + print("Block0 combined_scale after second conv3x3:", block_0_combined_scale2) + print("Block0 combined_scale after third conv1x1:", block_0_combined_scale3) + print( + "Block0 combined_scale after adding skip connection:", (block_0_combined_scale4) + ) + print("Block0 combined_scale after skip conv1x1:", block_0_combined_scale_skip) + + print("--------------------------------------------------------------") + print("Block1 
combined_scale after first conv1x1:", block_1_combined_scale1) + print("Block1 combined_scale after second conv3x3:", block_1_combined_scale2) + print("Block1 combined_scale after third conv1x1:", block_1_combined_scale3) + print( + "Block1 combined_scale after adding skip connection:", (block_1_combined_scale4) + ) + print("--------------------------------------------------------------") + print("Block2 combined_scale block2 after first conv1x1:", block_2_combined_scale1) + print("Block2 combined_scale block2 after second conv3x3:", block_2_combined_scale2) + print("Block2 combined_scale block2 after third conv1x1:", block_2_combined_scale3) + print( + "Block2 combined_scale block2 after adding skip connection:", + (block_2_combined_scale4), + ) + print("------------------------------------------------------------------") + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + # ------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + + block0_wts1 = ds.reorder_mat( + block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts2 = ds.reorder_mat( + block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts3 = ds.reorder_mat( + block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts_skip = ds.reorder_mat( + block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts = np.concatenate( + (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None + ) + + block1_wts1 = ds.reorder_mat( + block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block1_wts2 = ds.reorder_mat( + block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block1_wts3 = ds.reorder_mat( + block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts2 = np.concatenate( + (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None + ) + + block2_wts1 = ds.reorder_mat( + block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block2_wts2 = ds.reorder_mat( + block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block2_wts3 = ds.reorder_mat( + block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts3 = np.concatenate( + (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None + ) + + total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + import time + import cv2 + + predicted_label = [None] * 64 + cpu_predicted_label = [None] * 64 + aie_time = [None] * 64 + metafile = r"./data/cifar-10-batches-py/batches.meta" + datafile = r"./data/cifar-10-batches-py/test_batch" + data_batch_1 = unpickle(datafile) + metadata = unpickle(metafile) + images = data_batch_1["data"] + labels = data_batch_1["labels"] + images = np.reshape(images, (10000, 3, 32, 32)) + dirname = "cifar_images" + if not os.path.exists(dirname): + os.mkdir(dirname) + + # Extract and dump first 10 images + for i in range(0, 100): + im = images[i] + im = im.transpose(1, 2, 0) + im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) + im_name = 
f"./cifar_images/image_{i}.png" + cv2.imwrite(im_name, im) + + label_path = "data/cifar10_label_map.txt" + model_num_classes = 10 + class_label_map = load_class_label(label_path, model_num_classes) + quant_id_1 = QuantIdentity( + act_quant=Uint8ActPerTensorFixedPoint, bit_width=8, return_quant_tensor=True + ) + quant_id_1.eval() + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + + for i in range(0, 64): + print( + "____________________________________IMAGE {}____________________________________________".format( + i + ) + ) + image_name = f"./cifar_images/image_{i}.png" + img = Image.open(image_name) + input_tensor = transform_test(img) + input_batch = input_tensor.unsqueeze(0) + with torch.no_grad(): + # print(input_batch.shape + start = time.time() * 1000 + output1 = model.first(input_batch) + + # AIE OFFLOAD + qnt_inp = model.aie.x_quant(output1) + int_inp = model.aie.x_quant(output1).int(float_datatype=True) + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts3) * block_2_relu_3 + stop = time.time_ns() + temp_out = aie_output.reshape(32, 32, 32, 8) + temp2_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp2_out.reshape(256, 32, 32) + ofm_mem_fmt = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + final_output_aie = model.post(ofm_mem_fmt) + + # ------------------------------------------------------------------------------ + # Baseline output for functional correctness + output_golden = model.aie(output1) + max_error = torch.max(torch.abs(ofm_mem_fmt - output_golden)) + # print(max_error) + final_output_base = model.post(output_golden) + predicted_class = np.argmax(final_output_aie) + predicted_label[i] = metadata["label_names"][predicted_class] + cpu_predicted_class = np.argmax(final_output_base) + cpu_predicted_label[i] = metadata["label_names"][cpu_predicted_class] + label = metadata["label_names"][labels[i]] + print( + f" Predicted AIE: {predicted_label[i]}, Predicted CPU: {predicted_label[i]}" + ) + + # Calculate the five categories with the highest classification probability + prediction_class_index = ( + torch.topk(final_output_aie, k=5, sorted=True) + .indices.squeeze(0) + .tolist() + ) + golden_prediction_class_index = ( + torch.topk(final_output_base, k=5, sorted=True) + .indices.squeeze(0) + .tolist() + ) + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Compare the AIE output and the golden reference + # ------------------------------------------------------ + print("\nAvg NPU time: {}us.".format(int((npu_time_total / 64) / 1000))) + for x, y in zip(predicted_label, predicted_label): + if x != y: + print("\nFailed.\n") + exit(-1) + print("\nPASS!\n") + exit(0) + + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) diff --git a/programming_examples/ml/resnet/ptq_conv2x/utils.py b/programming_examples/ml/resnet/ptq_conv2x/utils.py new file mode 100644 index 0000000000..8ac8e93b8d --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/utils.py @@ -0,0 +1,41 @@ +import json +import cv2 +import numpy as np + + +def unpickle(file): + import pickle + + with open(file, "rb") as fo: + dict = pickle.load(fo, encoding="latin1") + return dict + + +def 
load_class_label(class_label_file: str, num_classes: int) -> list: + class_label = json.load(open(class_label_file)) + class_label_list = [class_label[str(i)] for i in range(num_classes)] + + return class_label_list + + +def extract_cifar(datafile, metafile): + data_batch_1 = unpickle(datafile) + metadata = unpickle(metafile) + + images = data_batch_1["data"] + labels = data_batch_1["labels"] + images = np.reshape(images, (10000, 3, 32, 32)) + + import os + + dirname = "cifar_images" + if not os.path.exists(dirname): + os.mkdir(dirname) + + # Extract and dump the first 100 images + for i in range(0, 100): + im = images[i] + im = im.transpose(1, 2, 0) + im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) + im_name = f"./cifar_images/image_{i}.png" + cv2.imwrite(im_name, im)
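A hypothetical usage of these helpers, mirroring `test.py` (assumes the CIFAR-10 python batches have already been downloaded into `./data`, e.g. by torchvision):

```python
# Hypothetical usage of the helpers above; paths mirror those hard-coded in test.py.
from utils import unpickle, load_class_label

meta = unpickle("./data/cifar-10-batches-py/batches.meta")
batch = unpickle("./data/cifar-10-batches-py/test_batch")
class_names = load_class_label("data/cifar10_label_map.txt", 10)

print(meta["label_names"][:3])    # ['airplane', 'automobile', 'bird']
print(class_names[:3])            # same order as data/cifar10_label_map.txt
print(len(batch["labels"]))       # 10000 CIFAR-10 test images
```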