From 3ad06df28a03b62e3985e0a434071db710d372ff Mon Sep 17 00:00:00 2001
From: Gagandeep Singh
Date: Sat, 27 Apr 2024 03:28:34 -0600
Subject: [PATCH 1/8] initial ptq with conv2x

---
 .../ml/resnet/ptq_conv2x/CMakeLists.txt |   89 ++
 .../ml/resnet/ptq_conv2x/Makefile       |   49 +
 .../ml/resnet/ptq_conv2x/aie2.py        | 1007 +++++++++++++++++
 .../ml/resnet/ptq_conv2x/test.py        |  489 ++++++++
 4 files changed, 1634 insertions(+)
 create mode 100755 programming_examples/ml/resnet/ptq_conv2x/CMakeLists.txt
 create mode 100755 programming_examples/ml/resnet/ptq_conv2x/Makefile
 create mode 100755 programming_examples/ml/resnet/ptq_conv2x/aie2.py
 create mode 100755 programming_examples/ml/resnet/ptq_conv2x/test.py

diff --git a/programming_examples/ml/resnet/ptq_conv2x/CMakeLists.txt b/programming_examples/ml/resnet/ptq_conv2x/CMakeLists.txt
new file mode 100755
index 0000000000..c7db0e9c5c
--- /dev/null
+++ b/programming_examples/ml/resnet/ptq_conv2x/CMakeLists.txt
@@ -0,0 +1,89 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DOpenCV_DIR: Path to OpenCV install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif ()
+
+set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width")
+set(EDGEDETECT_HEIGHT 1080 CACHE STRING "image height")
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+find_package(OpenCV REQUIRED)
+message("opencv library path: ${OpenCV_LIB_PATH}")
+message("opencv libs: ${OpenCV_LIBS}")
+
+
+add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC
+    EDGEDETECT_WIDTH=${EDGEDETECT_WIDTH}
+    EDGEDETECT_HEIGHT=${EDGEDETECT_HEIGHT}
+    DISABLE_ABI_CHECK=1
+    )
+
+target_include_directories (${currentTarget} PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils
+    ${XRT_INC_DIR}
+    ${OpenCV_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIRS}
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${OpenCV_LIB_PATH}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        ${OpenCV_LIBS}
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        ${OpenCV_LIBS}
+    )
+endif()

diff --git a/programming_examples/ml/resnet/ptq_conv2x/Makefile b/programming_examples/ml/resnet/ptq_conv2x/Makefile
new file mode 100755
index 0000000000..4b40c07da9
--- /dev/null
+++ b/programming_examples/ml/resnet/ptq_conv2x/Makefile
@@ -0,0 +1,49 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+include ${srcdir}/../../../makefile-common
+
+mlirFileName = aie
+
+VPATH := ${srcdir}/../../../../aie_kernels/aie2
+
+all: build/conv2dk1_i8.o build/conv2dk1_skip_init.o build/conv2dk3.o build/conv2dk1_skip.o build/conv2dk1_ui8.o build/final.xclbin
+
+build/${mlirFileName}.mlir: ${srcdir}/aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+insts.txt: build/${mlirFileName}.mlir
+	aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $<
+
+build/conv2dk1_i8.o: conv2dk1.cc
+	xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
+
+build/conv2dk3.o: conv2dk3.cc
+	xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@
+
+build/conv2dk1_skip_init.o: conv2dk1_skip_init.cc
+	xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
+
+build/conv2dk1_ui8.o: conv2dk1.cc
+	xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@
+
+build/conv2dk1_skip.o: conv2dk1_skip.cc
+	xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@
+
+build/final.xclbin: build/${mlirFileName}.mlir build/conv2dk1_i8.o build/conv2dk1_skip_init.o build/conv2dk3.o build/conv2dk1_skip.o build/conv2dk1_ui8.o
+	cd build && aiecc.py --basic-alloc-scheme --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${ 3x3
+                    act2_fifos[act2_fifo_names[i]] = object_fifo(
+                        act2_fifo_names[i],
+                        cores[i][0],
+                        [cores[i][3], cores[i][1]],
+                        4,
+                        tensorLayer1Out_ty,
+                    )
+
+                    # 3x3 -> 1x1
+                    act3_fifo_1[act3_fifo_names_1[i]] = object_fifo(
+                        act3_fifo_names_1[i],
+                        cores[i][1],
+                        cores[i][2],
+                        2,
+                        tensorLayer2Out_ty,
+                    )
+                    # 3x3 -> 1x1
+                    act3_fifo_2[act3_fifo_names_2[i]] = object_fifo(
+                        act3_fifo_names_2[i],
+                        cores[i][3],
+                        cores[i][2],
+                        2,
+                        tensorLayer2Out_ty,
+                    )
+                else:
+                    # 1x1 -> 3x3
+                    act2_fifos[act2_fifo_names[i]] = object_fifo(
+                        act2_fifo_names[i],
+                        cores[i][0],
+                        [cores[i][1], cores[i][3]],
+                        4,
+                        tensorLayer1Out_ty,
+                    )
+
+                    # 3x3 -> 1x1
+                    act3_fifo_1[act3_fifo_names_1[i]] = object_fifo(
+                        act3_fifo_names_1[i],
+                        cores[i][1],
+                        cores[i][2],
+                        2,
+                        tensorLayer2Out_ty,
+                    )
+                    # 3x3 -> 1x1
+                    act3_fifo_2[act3_fifo_names_2[i]] = object_fifo(
+                        act3_fifo_names_2[i],
+                        cores[i][3],
+                        cores[i][2],
+                        2,
+                        tensorLayer2Out_ty,
+                    )
+            wts_fifo_names = ["wts_0_L3L2", "wts_1_L3L2", "wts_2_L3L2"]
+            wts_fifos = {}
+            wts_sub_fifo_names = [
+                ["wts_buf_00", "wts_buf_01", "wts_buf_02"],
+                ["wts_buf_10", "wts_buf_11", "wts_buf_12"],
+                ["wts_buf_20", "wts_buf_21", "wts_buf_22"],
+            ]
+            wts_sub_fifos = {}
+
+            for i in range(n_cols):
+
+                wts_fifos[wts_fifo_names[i]] = object_fifo(
+                    wts_fifo_names[i], shims[i], mems[i], 1, wts_sizes[i]
+                )
+                wts_sub_fifos[wts_sub_fifo_names[i][0]] = object_fifo(
+                    wts_sub_fifo_names[i][0],
+                    mems[i],
+                    cores[i][0],
+                    1,
+                    layer1_wts_sizes[i],
+                )
+                if i == 1:
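+                    # Note: this branch differs from the else branch below only
+                    # in the order of the two 3x3 consumer tiles
+                    # ([cores[i][3], cores[i][1]] rather than [cores[i][1], cores[i][3]]).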
wts_sub_fifos[wts_sub_fifo_names[i][1]] = object_fifo( + wts_sub_fifo_names[i][1], + mems[i], + [cores[i][3], cores[i][1]], + 1, + weightsLayer2_ty, + ) + + else: + wts_sub_fifos[wts_sub_fifo_names[i][1]] = object_fifo( + wts_sub_fifo_names[i][1], + mems[i], + [cores[i][1], cores[i][3]], + 1, + weightsLayer2_ty, + ) + wts_sub_fifos[wts_sub_fifo_names[i][2]] = object_fifo( + wts_sub_fifo_names[i][2], + mems[i], + cores[i][2], + 1, + layer3_wts_sizes[i], + ) + object_fifo_link( + wts_fifo_names[i], + [ + wts_sub_fifo_names[i][0], + wts_sub_fifo_names[i][1], + wts_sub_fifo_names[i][2], + ], + ) + # output tensor + outOFL2L3 = object_fifo( + "outOFL2L3", cores[2][2], shims[1], 2, tensorLayer3Out_ty + ) + conv3_out_fifo = [ + act1_fifos[act1_fifo_names[1]], + act1_fifos[act1_fifo_names[2]], + outOFL2L3, + ] + conv3_out_fifo_names = ["act1_04_15_11", "act1_13_22_21", "outOFL2L3"] + # # 1x1 conv2d + for i in range(n_cols): + + @core(cores[i][0], conv1_kernels[i]) + def core_body(): + for _ in for_(sys.maxsize): + + # acquire weights once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][0] + ].acquire(ObjectFifoPort.Consume, 1) + scale = memref.load(rtp[i][0], [0]) + for _ in for_(tensorInH): + element0ActivactionsIn = act1_fifos[ + act1_fifo_names[i] + ].acquire(ObjectFifoPort.Consume, 1) + element0ActivactionsOut = act2_fifos[ + act2_fifo_names[i] + ].acquire(ObjectFifoPort.Produce, 1) + if i == 0: + res = call( + conv1_kernels_call[i], + [ + element0ActivactionsIn, + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit, + scale, + ], + ) + else: + res = call( + conv1_kernels_call[i], + [ + element0ActivactionsIn, + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCRest, + tensorInCInit, + scale, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act1_fifo_names[i], 1 + ) + + objectfifo_release( + ObjectFifoPort.Produce, act2_fifo_names[i], 1 + ) + yield_([]) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][0], 1 + ) + yield_([]) + + # 3x3 conv2d OFM 0-31 + for i in range(n_cols): + + @core(cores[i][1], "conv2dk3.o") + def core_body(): + scale = 11 + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][1] + ].acquire(ObjectFifoPort.Consume, 1) + # scale = memref.load(rtpComputeTile03, 0) + + # pre-amble: top row + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[0], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit // 2, + 3, + 3, + 0, + scale, + 0, + ], + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_1[i], 1 + ) + + # middle + for _ in for_(tensorInH - 2): + elementActivactionsIn = act2_fifos[ + act2_fifo_names[i] + ].acquire(ObjectFifoPort.Consume, 3) + element0ActivactionsOut = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[2], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit // 2, + 3, + 3, + 1, + scale, + 0, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 1 + ) + objectfifo_release( + 
ObjectFifoPort.Produce, act3_fifo_names_1[i], 1 + ) + yield_([]) + + # last part + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit // 2, + 3, + 3, + 2, + scale, + 0, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 2 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_1[i], 1 + ) + + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][1], 1 + ) + yield_([]) + + # 3x3 conv2d OFM 32-63 + + for i in range(n_cols): + + @core(cores[i][3], "conv2dk3.o") + def core_body(): + scale = 11 + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][1] + ].acquire(ObjectFifoPort.Consume, 1) + # scale = memref.load(rtpComputeTile05, 0) + + # pre-amble: top row + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[0], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit // 2, + 3, + 3, + 0, + scale, + tensorInCInit // 2, + ], + ) + + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 + ) + + # middle + for _ in for_(tensorInH - 2): + elementActivactionsIn = act2_fifos[ + act2_fifo_names[i] + ].acquire(ObjectFifoPort.Consume, 3) + element0ActivactionsOut = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[2], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit // 2, + 3, + 3, + 1, + scale, + tensorInCInit // 2, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 + ) + yield_([]) + + # last part + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[1], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit // 2, + 3, + 3, + 2, + scale, + tensorInCInit // 2, + ], + ) + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 2 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][1], 1 + ) + yield_([]) + + # # 1x1 conv2d and add skip + for i in range(n_cols): + + @core(cores[i][2], conv3_kernels[i]) + def core_body(): + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][2] + ].acquire(ObjectFifoPort.Consume, 1) + if i == 0: + scale = memref.load(rtp[0][3], [0]) + skipScale = memref.load(rtp[0][3], [1]) + skipConvScale = memref.load(rtp[0][3], [2]) + else: + scale = 
memref.load(rtp[i][2], [0]) + skipScale = memref.load(rtp[i][2], [1]) + + for _ in for_(tensorInH): + element0ActivactionsIn = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Consume, 1) + element1ActivactionsIn = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Consume, 1) + + elementActivactionsOut = conv3_out_fifo[i].acquire( + ObjectFifoPort.Produce, 1 + ) + elementSkipsIn = skip_fifos[skip_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 1 + ) + if i == 0: + call( + conv3_kernels_call[0], + [ + element0ActivactionsIn, + element1ActivactionsIn, + element0Weights, + elementActivactionsOut, + elementSkipsIn, + tensorInW, + tensorInCInit, + tensorInCRest, + tensorInCInit, + scale, + skipScale, + skipConvScale, + ], + ) + else: + call( + conv3_kernels_call[i], + [ + element0ActivactionsIn, + element1ActivactionsIn, + element0Weights, + elementActivactionsOut, + elementSkipsIn, + tensorInW, + tensorInCInit, + tensorInCRest, + scale, + skipScale, + ], + ) + objectfifo_release( + ObjectFifoPort.Consume, act3_fifo_names_1[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Consume, act3_fifo_names_2[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Produce, conv3_out_fifo_names[i], 1 + ) + + objectfifo_release( + ObjectFifoPort.Consume, skip_fifo_names[i], 1 + ) + yield_([]) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][2], 1 + ) + yield_([]) + + # instruction stream generation + activationsInSize32b = (tensorInW * tensorInH * tensorInCInit) // 4 + acitivationsOutSize32b = (tensorInW * tensorInH * tensorInCRest) // 4 + + totalWeightsSize32b_init = ( + tensorInCInit * tensorInCInit + + 3 * 3 * tensorInCInit * tensorInCInit + + 2 * tensorInCInit * tensorInCRest + ) // 4 + + totalWeightsSize32b_rest = ( + tensorInCInit * tensorInCRest + + 3 * 3 * tensorInCInit * tensorInCInit + + tensorInCInit * tensorInCRest + ) // 4 + + totalWeightsSize32b_complete = ( + totalWeightsSize32b_init + repeat * totalWeightsSize32b_rest + ) + + activationsInL3_ty = MemRefType.get((activationsInSize32b,), int32_ty) + activationsOutL3_ty = MemRefType.get((acitivationsOutSize32b,), int32_ty) + weightsInL3_ty_init = MemRefType.get((totalWeightsSize32b_init,), int32_ty) + weightsInL3_ty_rest = MemRefType.get((totalWeightsSize32b_rest,), int32_ty) + + weightsInL3_ty_complete = MemRefType.get( + (totalWeightsSize32b_complete,), int32_ty + ) + + @FuncOp.from_py_func( + activationsInL3_ty, weightsInL3_ty_complete, activationsOutL3_ty + ) + def sequence(inputFromL3, weightsFromL3, outputToL3): + + # for c, col in enumerate(rtp_name): + # for r, row in enumerate(col): + # NpuWriteRTPOp(row, col=c, row=r + 2, index=0, value=1) # scale + + # NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=1, value=0) + # NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=2, value=1) + + # NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=1, value=0) + + # NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=1, value=0) + + # # # write RTP parameters + # npuWriteRTPOp( + # "rtpComputeTile02", col=0, row=2, index=0, value=1 + # ) # scale + # npuWriteRTPOp( + # "rtpComputeTile03", col=0, row=3, index=0, value=1 + # ) # scale + # npuWriteRTPOp( + # "rtpComputeTile05", col=0, row=5, index=0, value=1 + # ) # scale + # npuWriteRTPOp( + # "rtpComputeTile04", col=0, row=4, index=0, value=1 + # ) # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input + # npuWriteRTPOp( + # "rtpComputeTile04", col=0, row=4, index=1, value=0 
+ # ) # skip_scale + + npu_dma_memcpy_nd( + metadata="act1_00_02_01", + bd_id=0, + mem=inputFromL3, + sizes=[1, 1, 1, activationsInSize32b], + ) + npu_dma_memcpy_nd( + metadata="outOFL2L3", + bd_id=2, + mem=outputToL3, + sizes=[1, 1, 1, acitivationsOutSize32b], + ) + npu_dma_memcpy_nd( + metadata="wts_0_L3L2", + bd_id=1, + mem=weightsFromL3, + sizes=[1, 1, 1, totalWeightsSize32b_init], + ) + + npu_dma_memcpy_nd( + metadata="wts_1_L3L2", + bd_id=1, + mem=weightsFromL3, + offsets=[0, 0, 0, totalWeightsSize32b_init], + sizes=[1, 1, 1, totalWeightsSize32b_rest], + ) + + npu_dma_memcpy_nd( + metadata="wts_2_L3L2", + bd_id=1, + mem=weightsFromL3, + offsets=[ + 0, + 0, + 0, + totalWeightsSize32b_init + totalWeightsSize32b_rest, + ], + sizes=[1, 1, 1, totalWeightsSize32b_rest], + ) + + npu_sync(column=1, row=0, direction=0, channel=0) + + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) + + +resnet_conv_x() diff --git a/programming_examples/ml/resnet/ptq_conv2x/test.py b/programming_examples/ml/resnet/ptq_conv2x/test.py new file mode 100755 index 0000000000..06989d55fa --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/test.py @@ -0,0 +1,489 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. + +import torch +import torch.nn as nn +import sys +import math +from aie.utils.ml import DataShaper +import time +import os +import numpy as np +from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute +import aie.utils.test as test_utils + +torch.use_deterministic_algorithms(True) +torch.manual_seed(0) + + +def main(opts): + design = "resnet_conv2_x_int8" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # ------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("uint8") + + shape_in_act = (32, 8, 32, 8) + shape_total_wts = (212992, 1) + shape_out = (32, 32, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, scaling factor for int8 model + # ------------------------------------------------------ + int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor) + block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor) + block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor) + block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type( + torch.FloatTensor + ) + block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type( + torch.FloatTensor + ) + + block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type( + torch.FloatTensor + ) + block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor) + block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type( + torch.FloatTensor + ) + + block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type( + torch.FloatTensor + ) + block_2_int_weight_2 = 
torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor) + block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type( + torch.FloatTensor + ) + + init_scale = 0.5 + block_0_relu_1 = 0.5 + block_0_relu_2 = 0.5 + block_0_relu_3 = 0.5 + + block_0_weight_scale1 = 0.5 + block_0_weight_scale2 = 0.5 + block_0_weight_scale3 = 0.5 + block_0_weight_scale_skip = 0.5 + + block_1_relu_1 = 0.5 + block_1_relu_2 = 0.5 + block_1_relu_3 = 0.5 + + block_1_weight_scale1 = 0.5 + block_1_weight_scale2 = 0.5 + block_1_weight_scale3 = 0.5 + block_1_quant_add_1 = 0.5 + + block_2_relu_1 = 0.5 + block_2_relu_2 = 0.5 + block_2_relu_3 = 0.5 + + block_2_weight_scale1 = 0.5 + block_2_weight_scale2 = 0.5 + block_2_weight_scale3 = 0.5 + block_2_quant_add_1 = 0.5 + + block_0_combined_scale1 = -math.log2( + init_scale * block_0_weight_scale1 / block_0_relu_1 + ) # RHS after first conv1x1 | clip 0-->255 + block_0_combined_scale2 = -math.log2( + block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2 + ) # RHS after second conv3x3 | clip 0-->255 + block_0_combined_scale3 = -math.log2( + block_0_relu_2 * block_0_weight_scale3 / init_scale + ) # RHS after third conv1x1 | clip -128-->+127 + block_0_combined_scale_skip = -math.log2( + init_scale * block_0_weight_scale_skip / init_scale + ) # LHS after conv1x1 | clip -128-->+127 + block_0_combined_scale4 = -math.log2( + init_scale / block_0_relu_3 + ) # After addition | clip 0-->255 + + block_1_combined_scale1 = -math.log2( + block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1 + ) # RHS after first conv1x1 | clip 0-->255 + block_1_combined_scale2 = -math.log2( + block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2 + ) # RHS after second conv3x3 | clip 0-->255 + block_1_combined_scale3 = -math.log2( + block_1_relu_2 * block_1_weight_scale3 / block_1_quant_add_1 + ) # RHS after third conv1x1 | clip -128-->+127 + block_1_combined_scale4 = -math.log2( + block_1_quant_add_1 / block_1_relu_3 + ) # After addition | clip 0-->255 + + block_2_combined_scale1 = -math.log2( + block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1 + ) # RHS after first conv1x1 | clip 0-->255 + block_2_combined_scale2 = -math.log2( + block_2_relu_1 * block_2_weight_scale2 / block_2_relu_2 + ) # RHS after second conv3x3 | clip 0-->255 + block_2_combined_scale3 = -math.log2( + block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1 + ) # RHS after third conv1x1 | clip -128-->+127 + block_2_combined_scale4 = -math.log2( + block_2_quant_add_1 / block_2_relu_3 + ) # After addition | clip 0-->255 + + min = 0 + max = 255 + + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class resnet_conv2_x_int8(nn.Module): + expansion = 4 + + def __init__(self, in_planes=64, planes=64): + super(resnet_conv2_x_int8, self).__init__() + + self.shortcut = nn.Conv2d( + in_planes, self.expansion * planes, kernel_size=1, bias=False + ) + # Bottleneck 0 + self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.block_0_conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + padding=1, + padding_mode="zeros", + 
bias=False, + ) + self.block_0_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_0_relu1 = nn.ReLU() + self.block_0_relu2 = nn.ReLU() + self.block_0_relu3 = nn.ReLU() + + # Bottleneck 1 + self.block_1_conv1 = nn.Conv2d( + self.expansion * planes, planes, kernel_size=1, bias=False + ) + self.block_1_conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + padding=1, + padding_mode="zeros", + bias=False, + ) + self.block_1_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_1_relu1 = nn.ReLU() + self.block_1_relu2 = nn.ReLU() + self.block_1_relu3 = nn.ReLU() + + # Bottleneck 2 + self.block_2_conv1 = nn.Conv2d( + self.expansion * planes, planes, kernel_size=1, bias=False + ) + self.block_2_conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + padding=1, + padding_mode="zeros", + bias=False, + ) + self.block_2_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_2_relu1 = nn.ReLU() + self.block_2_relu2 = nn.ReLU() + self.block_2_relu3 = nn.ReLU() + + def forward(self, x): + # **************** Bottleneck 0 **************** + block_0_conv1_out = ( + self.block_0_conv1(x) * init_scale * block_0_weight_scale1 + ) + block_0_relu1_out = torch.clamp( + torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1), + min, + max, + ) # convert to int and apply relu + block_0_conv2_out = ( + self.block_0_conv2(block_0_relu1_out) + * block_0_relu_1 + * block_0_weight_scale2 + ) + block_0_relu2_out = torch.clamp( + torch.round(self.block_0_relu2(block_0_conv2_out) / block_0_relu_2), + min, + max, + ) + block_0_conv3_out = ( + self.block_0_conv3(block_0_relu2_out) + * block_0_relu_2 + * block_0_weight_scale3 + ) + block_0_rhf_same_scale = torch.clamp( + torch.round(block_0_conv3_out / init_scale), -128, 127 + ) + + block_0_lhs_conv = self.shortcut(x) * init_scale * block_0_weight_scale_skip + block_0_lhs_same_scale = torch.clamp( + torch.round(block_0_lhs_conv / init_scale), -128, 127 + ) + # convert to int and apply relu + + block_0_skip_add = init_scale * ( + block_0_rhf_same_scale + block_0_lhs_same_scale + ) + block_0_final_out = torch.clamp( + torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), + min, + max, + ) + # **************** Bottleneck 1 **************** + block_1_conv1_out = ( + self.block_1_conv1(block_0_final_out) + * block_0_relu_3 + * block_1_weight_scale1 + ) + block_1_relu1_out = torch.clamp( + torch.round(self.block_1_relu1(block_1_conv1_out) / block_1_relu_1), + min, + max, + ) # convert to int and apply relu + block_1_conv2_out = ( + self.block_1_conv2(block_1_relu1_out) + * block_1_relu_1 + * block_1_weight_scale2 + ) + block_1_relu2_out = torch.clamp( + torch.round(self.block_1_relu2(block_1_conv2_out) / block_1_relu_2), + min, + max, + ) + block_1_conv3_out = ( + self.block_1_conv3(block_1_relu2_out) + * block_1_relu_2 + * block_1_weight_scale3 + ) + block_1_rhf_same_scale = torch.clamp( + torch.round(block_1_conv3_out / block_0_relu_3), -128, 127 + ) + + block_1_skip_add = block_0_relu_3 * ( + block_1_rhf_same_scale + block_0_final_out + ) + block_1_final_out = torch.clamp( + torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), + min, + max, + ) + + # **************** Bottleneck 2 **************** + block_2_conv1_out = ( + self.block_2_conv1(block_1_final_out) + * block_1_relu_3 + * block_2_weight_scale1 + ) + block_2_relu1_out = torch.clamp( + torch.round(self.block_2_relu1(block_2_conv1_out) / 
block_2_relu_1), + min, + max, + ) # convert to int and apply relu + block_2_conv2_out = ( + self.block_2_conv2(block_2_relu1_out) + * block_2_relu_1 + * block_2_weight_scale2 + ) + block_2_relu2_out = torch.clamp( + torch.round(self.block_2_relu2(block_2_conv2_out) / block_2_relu_2), + min, + max, + ) + block_2_conv3_out = ( + self.block_2_conv3(block_2_relu2_out) + * block_2_relu_2 + * block_2_weight_scale3 + ) + block_2_rhf_same_scale = torch.clamp( + torch.round(block_2_conv3_out / block_1_relu_3), -128, 127 + ) + + block_2_skip_add = block_1_relu_3 * ( + block_2_rhf_same_scale + block_1_final_out + ) + block_2_final_out = block_2_relu_3 * ( + torch.clamp( + torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3), + min, + max, + ) + ) + return block_2_final_out + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = resnet_conv2_x_int8() + model.eval() + model.block_0_conv1.weight.data.copy_(block_0_int_weight_1) + model.block_0_conv2.weight.data.copy_(block_0_int_weight_2) + model.block_0_conv3.weight.data.copy_(block_0_int_weight_3) + model.shortcut.weight.data.copy_(block_0_int_weight_skip) + + model.block_1_conv1.weight.data.copy_(block_1_int_weight_1) + model.block_1_conv2.weight.data.copy_(block_1_int_weight_2) + model.block_1_conv3.weight.data.copy_(block_1_int_weight_3) + + model.block_2_conv1.weight.data.copy_(block_2_int_weight_1) + model.block_2_conv2.weight.data.copy_(block_2_int_weight_2) + model.block_2_conv3.weight.data.copy_(block_2_int_weight_3) + + golden_output = model(int_inp) + + # ------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d" + ) + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + + block0_wts1 = ds.reorder_mat( + block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts2 = ds.reorder_mat( + block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts3 = ds.reorder_mat( + block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts_skip = ds.reorder_mat( + block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts = np.concatenate( + (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None + ) + + block1_wts1 = ds.reorder_mat( + block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block1_wts2 = ds.reorder_mat( + block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block1_wts3 = ds.reorder_mat( + block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts2 = np.concatenate( + (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None + ) + + block2_wts1 = ds.reorder_mat( + block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block2_wts2 = ds.reorder_mat( + block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block2_wts3 = ds.reorder_mat( + block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts3 = np.concatenate( + (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None + ) + + 
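+    # Illustrative sanity check (not part of the original test): the
+    # concatenated int8 weight stream should match shape_total_wts above.
+    #   block 0:        64*64 + 3*3*64*64 + 256*64 + 256*64 (skip) = 73728
+    #   blocks 1 and 2: 256*64 + 3*3*64*64 + 256*64 each           = 69632
+    # 73728 + 2 * 69632 = 212992 int8 values in total.
+    assert total_wts3.size == 212992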
total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts3) * block_2_relu_3 + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 32, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(256, 32, 32) + ofm_mem_fmt.tofile( + log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and the golden reference + # ------------------------------------------------------ + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + if np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=block_2_relu_3, + ): + print("\nPASS!\n") + exit(0) + else: + print("\nFailed.\n") + exit(-1) + + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) From dacdc6aa97843ee7e286aaeeba116bfed99f1859 Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Tue, 30 Apr 2024 10:22:59 -0600 Subject: [PATCH 2/8] ptq for offloaded resnet conv2x --- .../ml/resnet/ptq_conv2x/aie2.py | 56 +- .../ptq_conv2x/data/cifar10_label_map.txt | 1 + .../ml/resnet/ptq_conv2x/model.py | 151 ++++ .../ml/resnet/ptq_conv2x/requirements.txt | 4 + .../ml/resnet/ptq_conv2x/run_makefile.lit | 9 + .../ml/resnet/ptq_conv2x/test.py | 671 +++++++++--------- .../ml/resnet/ptq_conv2x/utils.py | 40 ++ 7 files changed, 560 insertions(+), 372 deletions(-) create mode 100644 programming_examples/ml/resnet/ptq_conv2x/data/cifar10_label_map.txt create mode 100644 programming_examples/ml/resnet/ptq_conv2x/model.py create mode 100644 programming_examples/ml/resnet/ptq_conv2x/requirements.txt create mode 100644 programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit create mode 100644 programming_examples/ml/resnet/ptq_conv2x/utils.py diff --git a/programming_examples/ml/resnet/ptq_conv2x/aie2.py b/programming_examples/ml/resnet/ptq_conv2x/aie2.py index 27710c2c71..6992f3da81 100755 --- a/programming_examples/ml/resnet/ptq_conv2x/aie2.py +++ b/programming_examples/ml/resnet/ptq_conv2x/aie2.py @@ -580,7 +580,10 @@ def core_body(): @core(cores[i][1], "conv2dk3.o") def core_body(): - scale = 11 + if(i==2): + scale = 9 + else: + scale = 9 for _ in for_(sys.maxsize): # acquire weights and rtps once @@ -697,7 +700,10 @@ def core_body(): @core(cores[i][3], "conv2dk3.o") def core_body(): - scale = 11 + if(i==2): + scale = 9 + else: + scale = 9 for _ in for_(sys.maxsize): # acquire weights and rtps once @@ -930,30 +936,28 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): # for c, col in enumerate(rtp_name): # for r, row in enumerate(col): # NpuWriteRTPOp(row, col=c, row=r + 2, index=0, value=1) # scale - - # NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=1, value=0) - # 
NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=2, value=1) - - # NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=1, value=0) - - # NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=1, value=0) - - # # # write RTP parameters - # npuWriteRTPOp( - # "rtpComputeTile02", col=0, row=2, index=0, value=1 - # ) # scale - # npuWriteRTPOp( - # "rtpComputeTile03", col=0, row=3, index=0, value=1 - # ) # scale - # npuWriteRTPOp( - # "rtpComputeTile05", col=0, row=5, index=0, value=1 - # ) # scale - # npuWriteRTPOp( - # "rtpComputeTile04", col=0, row=4, index=0, value=1 - # ) # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input - # npuWriteRTPOp( - # "rtpComputeTile04", col=0, row=4, index=1, value=0 - # ) # skip_scale + NpuWriteRTPOp("rtpComputeTile02", col=0, row=2, index=0, value=8) + NpuWriteRTPOp("rtpComputeTile03", col=0, row=3, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile04", col=0, row=5, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=0, value=11) + NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=1, value=0) + NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=2, value=7) + + NpuWriteRTPOp("rtpComputeTile15", col=1, row=5, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile14", col=1, row=4, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile12", col=1, row=2, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=0, value=12) + NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=1, value=0) + + NpuWriteRTPOp("rtpComputeTile22", col=2, row=2, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile23", col=2, row=3, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile25", col=2, row=5, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=0, value=12) + NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=1, value=0) + + rtp_1=[7,10,13,-2,10] + rtp_2=[8,10,12] + rtp_3=[9,9,12] npu_dma_memcpy_nd( metadata="act1_00_02_01", diff --git a/programming_examples/ml/resnet/ptq_conv2x/data/cifar10_label_map.txt b/programming_examples/ml/resnet/ptq_conv2x/data/cifar10_label_map.txt new file mode 100644 index 0000000000..1fc508024c --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/data/cifar10_label_map.txt @@ -0,0 +1 @@ +{"0": "airplane", "1": "automobile", "2": "bird", "3": "cat", "4": "deer", "5": "dog", "6": "frog", "7": "horse", "8": "ship", "9": "truck"} \ No newline at end of file diff --git a/programming_examples/ml/resnet/ptq_conv2x/model.py b/programming_examples/ml/resnet/ptq_conv2x/model.py new file mode 100644 index 0000000000..68c6feaa8b --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/model.py @@ -0,0 +1,151 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class CombinedModel(nn.Module): + def __init__(self, first, aie, post): + super(CombinedModel, self).__init__() + self.first = first + self.aie = aie + self.post = post + + def forward(self, x): + x = self.first(x) + x = self.aie(x) + x = self.post(x) + return x + +class PreAIELayers(nn.Module): + def __init__(self): + super(PreAIELayers, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + + def forward(self, x): + out = self.conv1(x) + out = self.bn1(out) + # print( out) + out = F.relu(out) + return out + + +class AIEConv2xOffload(nn.Module): + def __init__(self, block, num_blocks): + super(AIEConv2xOffload, self).__init__() + self.in_planes = 64 + 
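+        # The three bottleneck blocks below form the conv2_x stage offloaded
+        # to the NPU: the first expands 64 -> 256 channels through a
+        # 64-channel bottleneck, and the next two map 256 -> 256 the same way.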
self.layer1 = block(in_planes=64, planes=64) + self.layer2 = block(in_planes=256, planes=64) + self.layer3 = block(in_planes=256, planes=64) + + def forward(self, x): + out = self.layer1(x) + out = self.layer2(out) + out = self.layer3(out) + return out + + +class PostAIELayers(nn.Module): + def __init__(self, block, num_blocks, num_classes): + super(PostAIELayers, self).__init__() + + self.in_planes = 256 + self.layer2 = self._make_layer(block, 128, num_blocks[0], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[1], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[2], stride=2) + self.linear = nn.Linear(512 * block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1] * (num_blocks - 1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = self.layer2(x) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 32) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out +class Bottleneck_projected(nn.Module): + expansion = 4 + + def __init__(self, in_planes, planes, stride=1, option="A"): + super(Bottleneck_projected, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + self.bn3 = nn.BatchNorm2d(self.expansion * planes) + self.relu1 = nn.ReLU() + self.relu2 = nn.ReLU() + self.relu3 = nn.ReLU() + + self.shortcut = nn.Sequential() + if in_planes != self.expansion * planes: + self.shortcut = nn.Sequential( + nn.Conv2d( + in_planes, self.expansion * planes, kernel_size=1, bias=False + ), + nn.BatchNorm2d(self.expansion * planes), + ) + def forward(self, x): + out = self.relu1(self.bn1(self.conv1(x))) + out = self.relu2(self.bn2(self.conv2(out))) + out = self.bn3(self.conv3(out)) + out = out + self.shortcut(x) + out = self.relu3(out) + return out + +class Bottleneck_fused_projected(nn.Module): + expansion = 4 + + def __init__(self, in_planes, planes, stride=1, option="A"): + super(Bottleneck_fused_projected, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + + self.conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.relu1 = nn.ReLU() + self.relu2 = nn.ReLU() + self.relu3 = nn.ReLU() + + self.shortcut = nn.Sequential() + if in_planes != self.expansion * planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, bias=False) + ) + + def forward(self, x): + out = self.relu1((self.conv1(x))) + out = self.relu2((self.conv2(out))) + out = self.conv3(out) + out += self.shortcut(x) + out = self.relu3(out) + return out + +def Resnet50_conv2x_offload(num_classes): + return CombinedModel( + PreAIELayers(), + AIEConv2xOffload( + Bottleneck_fused_projected, + [ + 1, + ], + ), + PostAIELayers(Bottleneck_projected, [4, 6, 3], num_classes), + ) \ No newline at end of file diff --git a/programming_examples/ml/resnet/ptq_conv2x/requirements.txt b/programming_examples/ml/resnet/ptq_conv2x/requirements.txt new 
file mode 100644 index 0000000000..47a9883564 --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/requirements.txt @@ -0,0 +1,4 @@ +brevitas +torchvision +tqdm +opencv-python \ No newline at end of file diff --git a/programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit b/programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit new file mode 100644 index 0000000000..6097345491 --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess, torch +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile +// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s +// CHECK: PASS! diff --git a/programming_examples/ml/resnet/ptq_conv2x/test.py b/programming_examples/ml/resnet/ptq_conv2x/test.py index 06989d55fa..8bf5857bd6 100755 --- a/programming_examples/ml/resnet/ptq_conv2x/test.py +++ b/programming_examples/ml/resnet/ptq_conv2x/test.py @@ -13,12 +13,31 @@ import time import os import numpy as np +import model as res + from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) - +from utils import unpickle,load_class_label +import torchvision +from torchvision import transforms +from PIL import Image +from brevitas.nn import QuantConv2d, QuantIdentity, QuantReLU +from brevitas.quant.fixed_point import ( + Int8ActPerTensorFixedPoint, + Int8WeightPerTensorFixedPoint, + Uint8ActPerTensorFixedPoint, +) +from brevitas.graph.target.flexml import preprocess_for_flexml_quantize +from brevitas_examples.imagenet_classification.ptq.ptq_common import quantize_model +import torch.utils.data as data_utils +from brevitas_examples.imagenet_classification.ptq.ptq_common import calibrate +from brevitas_examples.imagenet_classification.ptq.ptq_common import calibrate_bn +from brevitas_examples.imagenet_classification.utils import generate_dataloader +from brevitas_examples.imagenet_classification.utils import SEED +from brevitas_examples.imagenet_classification.utils import validate def main(opts): design = "resnet_conv2_x_int8" @@ -48,108 +67,235 @@ def main(opts): shape_out = (32, 32, 32, 8) # ------------------------------------------------------ - # Initialize activation, weights, scaling factor for int8 model + # Post training quantization to get int8 weights and activation for AIE # ------------------------------------------------------ - int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor) - block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor) - block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor) - block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type( - torch.FloatTensor - ) - block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type( - torch.FloatTensor + num_classes = 10 + model = res.Resnet50_conv2x_offload(num_classes) + weights = "trained_resnet50/weight.tar" #trained FP model + saved_model_dict = torch.load(weights, map_location=torch.device("cpu")) + model.load_state_dict(saved_model_dict) + + data_dir = "data" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + transform = transforms.Compose( + [ + transforms.Pad(4), + transforms.RandomHorizontalFlip(), + transforms.RandomCrop(32), + transforms.ToTensor(), + ] ) - - block_1_int_weight_1 = 
torch.randint(20, 30, (64, 256, 1, 1)).type( - torch.FloatTensor + transform_train = transforms.Compose( + [ + transforms.RandomHorizontalFlip(), + transforms.RandomCrop(32, padding=4), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] ) - block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor) - block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type( - torch.FloatTensor + transform_test = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] ) - block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type( - torch.FloatTensor + # CIFAR-10 dataset + train_dataset = torchvision.datasets.CIFAR10( + root=data_dir, train=True, transform=transform_train, download=True ) - block_2_int_weight_2 = torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor) - block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type( - torch.FloatTensor + test_dataset = torchvision.datasets.CIFAR10( + root=data_dir, train=False, transform=transform_test, download=True ) - init_scale = 0.5 - block_0_relu_1 = 0.5 - block_0_relu_2 = 0.5 - block_0_relu_3 = 0.5 - - block_0_weight_scale1 = 0.5 - block_0_weight_scale2 = 0.5 - block_0_weight_scale3 = 0.5 - block_0_weight_scale_skip = 0.5 - - block_1_relu_1 = 0.5 - block_1_relu_2 = 0.5 - block_1_relu_3 = 0.5 - - block_1_weight_scale1 = 0.5 - block_1_weight_scale2 = 0.5 - block_1_weight_scale3 = 0.5 - block_1_quant_add_1 = 0.5 - - block_2_relu_1 = 0.5 - block_2_relu_2 = 0.5 - block_2_relu_3 = 0.5 - - block_2_weight_scale1 = 0.5 - block_2_weight_scale2 = 0.5 - block_2_weight_scale3 = 0.5 - block_2_quant_add_1 = 0.5 - - block_0_combined_scale1 = -math.log2( - init_scale * block_0_weight_scale1 / block_0_relu_1 - ) # RHS after first conv1x1 | clip 0-->255 - block_0_combined_scale2 = -math.log2( - block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2 - ) # RHS after second conv3x3 | clip 0-->255 - block_0_combined_scale3 = -math.log2( - block_0_relu_2 * block_0_weight_scale3 / init_scale - ) # RHS after third conv1x1 | clip -128-->+127 - block_0_combined_scale_skip = -math.log2( - init_scale * block_0_weight_scale_skip / init_scale - ) # LHS after conv1x1 | clip -128-->+127 - block_0_combined_scale4 = -math.log2( - init_scale / block_0_relu_3 - ) # After addition | clip 0-->255 + # Data loader + indices = torch.arange(256) + tr_sub = data_utils.Subset(train_dataset, indices) + val_sub = data_utils.Subset(test_dataset, indices) + calib_loader = torch.utils.data.DataLoader(dataset=tr_sub, batch_size=64, shuffle=True) + val_loader = torch.utils.data.DataLoader(dataset=val_sub, batch_size=64, shuffle=False) + img_shape = 32 + model_aie = preprocess_for_flexml_quantize( + model.aie, + torch.ones(1, 64, img_shape, img_shape), + equalize_iters=1000, + equalize_merge_bias=True, + merge_bn=True, + ) - block_1_combined_scale1 = -math.log2( - block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1 - ) # RHS after first conv1x1 | clip 0-->255 - block_1_combined_scale2 = -math.log2( - block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2 - ) # RHS after second conv3x3 | clip 0-->255 - block_1_combined_scale3 = -math.log2( - block_1_relu_2 * block_1_weight_scale3 / block_1_quant_add_1 - ) # RHS after third conv1x1 | clip -128-->+127 - block_1_combined_scale4 = -math.log2( - block_1_quant_add_1 / block_1_relu_3 - ) # After addition | clip 0-->255 + quant_model = quantize_model( + model_aie, + backend="flexml", + 
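+        # Assumed rationale: "po2_scale" keeps every quantization scale a
+        # power of two, so the combined scales computed below via -log(..., 2)
+        # reduce to integer shift amounts for the AIE kernels.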
scale_factor_type="po2_scale", + bias_bit_width=32, + weight_bit_width=8, + weight_narrow_range=False, + weight_param_method="stats", + weight_quant_granularity="per_tensor", + weight_quant_type="sym", + layerwise_first_last_bit_width=8, + act_bit_width=8, + act_param_method="stats", + act_quant_percentile=99.999, + act_quant_type="sym", + quant_format="int", + layerwise_first_last_mantissa_bit_width=4, + layerwise_first_last_exponent_bit_width=3, + weight_mantissa_bit_width=4, + weight_exponent_bit_width=3, + act_mantissa_bit_width=4, + act_exponent_bit_width=3, + ) - block_2_combined_scale1 = -math.log2( - block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1 + model.aie = quant_model + model.eval() + print("Starting post training quantization:") + calibrate(calib_loader, model) + model.eval() + device, dtype = ( + next(model.parameters()).device, + next(model.parameters()).dtype, + ) + # ----------------------- + + + from numpy import load + + params = {} + weights = {} + for name, module in model.named_modules(): + if isinstance(module, QuantConv2d): + # print(name) + # print(module.quant_weight().scale) + weights[name + ".int_weight"] = module.quant_weight().int(float_datatype=False) + params[name + "_scale"] = module.quant_weight().scale.detach().numpy() + if isinstance(module, QuantIdentity): + # print(name) + # print(module.quant_act_scale()) + params[name + "_scale"] = module.quant_act_scale() + if isinstance(module, QuantReLU): + # print(name) + # print(module.quant_act_scale()) + params[name + "_scale"] = module.quant_act_scale() + np.savez(os.path.join(os.getcwd(), "int_weights.npz"), **weights) + np.savez(os.path.join(os.getcwd(), "int_conv_scale.npz"), **params) + int_wts_data = load("int_weights.npz", allow_pickle=True) + int_scale_data = load("int_conv_scale.npz", allow_pickle=True) + + int_wts_data_lst = int_wts_data.files + block_0_int_weight_1 = torch.from_numpy(int_wts_data["aie.layer1.conv1.int_weight"]) + block_0_int_weight_2 = torch.from_numpy(int_wts_data["aie.layer1.conv2.int_weight"]) + block_0_int_weight_3 = torch.from_numpy(int_wts_data["aie.layer1.conv3.int_weight"]) + block_0_int_weight_skip = torch.from_numpy(int_wts_data["aie.layer1.shortcut.0.int_weight"]) + + block_1_int_weight_1 = torch.from_numpy(int_wts_data["aie.layer2.conv1.int_weight"]) + block_1_int_weight_2 = torch.from_numpy(int_wts_data["aie.layer2.conv2.int_weight"]) + block_1_int_weight_3 = torch.from_numpy(int_wts_data["aie.layer2.conv3.int_weight"]) + + block_2_int_weight_1 = torch.from_numpy(int_wts_data["aie.layer3.conv1.int_weight"]) + block_2_int_weight_2 = torch.from_numpy(int_wts_data["aie.layer3.conv2.int_weight"]) + block_2_int_weight_3 = torch.from_numpy(int_wts_data["aie.layer3.conv3.int_weight"]) + + int_scale_data_lst = int_scale_data.files + + init_scale = int_scale_data["aie.x_quant_scale"] + block_0_relu_1 = int_scale_data["aie.layer1.relu1_scale"] + block_0_relu_2 = int_scale_data["aie.layer1.relu2_scale"] + block_0_relu_3 = int_scale_data["aie.layer1.relu3_scale"] + block_0_add_scale = int_scale_data["aie.add_quant_scale"] + + block_0_weight_scale_1 = int_scale_data["aie.layer1.conv1_scale"] + block_0_weight_scale_2 = int_scale_data["aie.layer1.conv2_scale"] + block_0_weight_scale_3 = int_scale_data["aie.layer1.conv3_scale"] + block_0_weight_scale_skip = int_scale_data["aie.layer1.shortcut.0_scale"] + + block_1_relu_1 = int_scale_data["aie.layer2.relu1_scale"] + block_1_relu_2 = int_scale_data["aie.layer2.relu2_scale"] + block_1_relu_3 = 
int_scale_data["aie.layer2.relu3_scale"] + block_1_add_scale = int_scale_data["aie.add_1_quant_scale"] + + block_1_weight_scale_1 = int_scale_data["aie.layer2.conv1_scale"] + block_1_weight_scale_2 = int_scale_data["aie.layer2.conv2_scale"] + block_1_weight_scale_3 = int_scale_data["aie.layer2.conv3_scale"] + + block_2_relu_1 = int_scale_data["aie.layer3.relu1_scale"] + block_2_relu_2 = int_scale_data["aie.layer3.relu2_scale"] + block_2_relu_3 = int_scale_data["aie.layer3.relu3_scale"] + block_2_add_scale = int_scale_data["aie.add_2_quant_scale"] + + block_2_weight_scale_1 = int_scale_data["aie.layer3.conv1_scale"] + block_2_weight_scale_2 = int_scale_data["aie.layer3.conv2_scale"] + block_2_weight_scale_3 = int_scale_data["aie.layer3.conv3_scale"] + + for name, param in model.named_parameters(): + if name.endswith(".bias"): + param.data.fill_(0) + + block_0_combined_scale1 = -math.log( + init_scale * block_0_weight_scale_1 / block_0_relu_1, 2 + ) # after conv1x1 + block_0_combined_scale2 = -math.log( + block_0_relu_1 * block_0_weight_scale_2 / block_0_relu_2, 2 + ) # after conv3x3 + block_0_combined_scale3 = -math.log( + block_0_relu_2 * block_0_weight_scale_3 / block_0_add_scale, 2 + ) # after conv1x1 + block_0_combined_scale4 = -math.log( + block_0_add_scale / block_0_relu_3, 2 + ) # after skip addition using init scale + # combined_scale4=-math.log(inp_scale1/inp_scale4) + block_0_combined_scale_skip = -math.log( + init_scale * block_0_weight_scale_skip / block_0_add_scale, 2 + ) # after LHS conv1x1 + + block_1_combined_scale1 = -math.log( + block_0_relu_3 * block_1_weight_scale_1 / block_1_relu_1, 2 + ) # after conv1x1 + block_1_combined_scale2 = -math.log( + block_1_relu_1 * block_1_weight_scale_2 / block_1_relu_2, 2 + ) # after conv3x3 + block_1_combined_scale3 = -math.log( + block_1_relu_2 * block_1_weight_scale_3 / block_1_add_scale, 2 + ) # after conv1x1 + block_1_combined_scale4 = -math.log( + block_1_add_scale / block_1_relu_3, 2 + ) # after skip addition using init scale + + block_2_combined_scale1 = -math.log( + block_1_relu_3 * block_2_weight_scale_1 / block_2_relu_1, 2 ) # RHS after first conv1x1 | clip 0-->255 - block_2_combined_scale2 = -math.log2( - block_2_relu_1 * block_2_weight_scale2 / block_2_relu_2 + block_2_combined_scale2 = -math.log( + block_2_relu_1 * block_2_weight_scale_2 / block_2_relu_2, 2 ) # RHS after second conv3x3 | clip 0-->255 - block_2_combined_scale3 = -math.log2( - block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1 + block_2_combined_scale3 = -math.log( + block_2_relu_2 * block_2_weight_scale_3 / block_2_add_scale, 2 ) # RHS after third conv1x1 | clip -128-->+127 - block_2_combined_scale4 = -math.log2( - block_2_quant_add_1 / block_2_relu_3 + block_2_combined_scale4 = -math.log( + block_2_add_scale / block_2_relu_3, 2 ) # After addition | clip 0-->255 - min = 0 - max = 255 - - # ------------------------------------------------------ + print("--------------------------------------------------------------") + print("Block0 combined_scale after first conv1x1:", block_0_combined_scale1) + print("Block0 combined_scale after second conv3x3:", block_0_combined_scale2) + print("Block0 combined_scale after third conv1x1:", block_0_combined_scale3) + print("Block0 combined_scale after adding skip connection:", (block_0_combined_scale4)) + print("Block0 combined_scale after skip conv1x1:", block_0_combined_scale_skip) + + print("--------------------------------------------------------------") + print("Block1 combined_scale after first conv1x1:", 
block_1_combined_scale1) + print("Block1 combined_scale after second conv3x3:", block_1_combined_scale2) + print("Block1 combined_scale after third conv1x1:", block_1_combined_scale3) + print("Block1 combined_scale after adding skip connection:", (block_1_combined_scale4)) + print("--------------------------------------------------------------") + print("Block2 combined_scale block2 after first conv1x1:", block_2_combined_scale1) + print("Block2 combined_scale block2 after second conv3x3:", block_2_combined_scale2) + print("Block2 combined_scale block2 after third conv1x1:", block_2_combined_scale3) + print( + "Block2 combined_scale block2 after adding skip connection:", + (block_2_combined_scale4), + ) + print("------------------------------------------------------------------") + # ------------------------------------------------------ # Get device, load the xclbin & kernel and register them # ------------------------------------------------------ app = setup_aie( @@ -164,231 +310,10 @@ def main(opts): enable_trace=enable_trace, trace_size=trace_size, ) - - # ------------------------------------------------------ - # Define your golden reference - # ------------------------------------------------------ - class resnet_conv2_x_int8(nn.Module): - expansion = 4 - - def __init__(self, in_planes=64, planes=64): - super(resnet_conv2_x_int8, self).__init__() - - self.shortcut = nn.Conv2d( - in_planes, self.expansion * planes, kernel_size=1, bias=False - ) - # Bottleneck 0 - self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) - self.block_0_conv2 = nn.Conv2d( - planes, - planes, - kernel_size=3, - padding=1, - padding_mode="zeros", - bias=False, - ) - self.block_0_conv3 = nn.Conv2d( - planes, self.expansion * planes, kernel_size=1, bias=False - ) - - self.block_0_relu1 = nn.ReLU() - self.block_0_relu2 = nn.ReLU() - self.block_0_relu3 = nn.ReLU() - - # Bottleneck 1 - self.block_1_conv1 = nn.Conv2d( - self.expansion * planes, planes, kernel_size=1, bias=False - ) - self.block_1_conv2 = nn.Conv2d( - planes, - planes, - kernel_size=3, - padding=1, - padding_mode="zeros", - bias=False, - ) - self.block_1_conv3 = nn.Conv2d( - planes, self.expansion * planes, kernel_size=1, bias=False - ) - - self.block_1_relu1 = nn.ReLU() - self.block_1_relu2 = nn.ReLU() - self.block_1_relu3 = nn.ReLU() - - # Bottleneck 2 - self.block_2_conv1 = nn.Conv2d( - self.expansion * planes, planes, kernel_size=1, bias=False - ) - self.block_2_conv2 = nn.Conv2d( - planes, - planes, - kernel_size=3, - padding=1, - padding_mode="zeros", - bias=False, - ) - self.block_2_conv3 = nn.Conv2d( - planes, self.expansion * planes, kernel_size=1, bias=False - ) - - self.block_2_relu1 = nn.ReLU() - self.block_2_relu2 = nn.ReLU() - self.block_2_relu3 = nn.ReLU() - - def forward(self, x): - # **************** Bottleneck 0 **************** - block_0_conv1_out = ( - self.block_0_conv1(x) * init_scale * block_0_weight_scale1 - ) - block_0_relu1_out = torch.clamp( - torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1), - min, - max, - ) # convert to int and apply relu - block_0_conv2_out = ( - self.block_0_conv2(block_0_relu1_out) - * block_0_relu_1 - * block_0_weight_scale2 - ) - block_0_relu2_out = torch.clamp( - torch.round(self.block_0_relu2(block_0_conv2_out) / block_0_relu_2), - min, - max, - ) - block_0_conv3_out = ( - self.block_0_conv3(block_0_relu2_out) - * block_0_relu_2 - * block_0_weight_scale3 - ) - block_0_rhf_same_scale = torch.clamp( - torch.round(block_0_conv3_out / init_scale), -128, 127 
- ) - - block_0_lhs_conv = self.shortcut(x) * init_scale * block_0_weight_scale_skip - block_0_lhs_same_scale = torch.clamp( - torch.round(block_0_lhs_conv / init_scale), -128, 127 - ) - # convert to int and apply relu - - block_0_skip_add = init_scale * ( - block_0_rhf_same_scale + block_0_lhs_same_scale - ) - block_0_final_out = torch.clamp( - torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), - min, - max, - ) - # **************** Bottleneck 1 **************** - block_1_conv1_out = ( - self.block_1_conv1(block_0_final_out) - * block_0_relu_3 - * block_1_weight_scale1 - ) - block_1_relu1_out = torch.clamp( - torch.round(self.block_1_relu1(block_1_conv1_out) / block_1_relu_1), - min, - max, - ) # convert to int and apply relu - block_1_conv2_out = ( - self.block_1_conv2(block_1_relu1_out) - * block_1_relu_1 - * block_1_weight_scale2 - ) - block_1_relu2_out = torch.clamp( - torch.round(self.block_1_relu2(block_1_conv2_out) / block_1_relu_2), - min, - max, - ) - block_1_conv3_out = ( - self.block_1_conv3(block_1_relu2_out) - * block_1_relu_2 - * block_1_weight_scale3 - ) - block_1_rhf_same_scale = torch.clamp( - torch.round(block_1_conv3_out / block_0_relu_3), -128, 127 - ) - - block_1_skip_add = block_0_relu_3 * ( - block_1_rhf_same_scale + block_0_final_out - ) - block_1_final_out = torch.clamp( - torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), - min, - max, - ) - - # **************** Bottleneck 2 **************** - block_2_conv1_out = ( - self.block_2_conv1(block_1_final_out) - * block_1_relu_3 - * block_2_weight_scale1 - ) - block_2_relu1_out = torch.clamp( - torch.round(self.block_2_relu1(block_2_conv1_out) / block_2_relu_1), - min, - max, - ) # convert to int and apply relu - block_2_conv2_out = ( - self.block_2_conv2(block_2_relu1_out) - * block_2_relu_1 - * block_2_weight_scale2 - ) - block_2_relu2_out = torch.clamp( - torch.round(self.block_2_relu2(block_2_conv2_out) / block_2_relu_2), - min, - max, - ) - block_2_conv3_out = ( - self.block_2_conv3(block_2_relu2_out) - * block_2_relu_2 - * block_2_weight_scale3 - ) - block_2_rhf_same_scale = torch.clamp( - torch.round(block_2_conv3_out / block_1_relu_3), -128, 127 - ) - - block_2_skip_add = block_1_relu_3 * ( - block_2_rhf_same_scale + block_1_final_out - ) - block_2_final_out = block_2_relu_3 * ( - torch.clamp( - torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3), - min, - max, - ) - ) - return block_2_final_out - - # ------------------------------------------------------ - # Pytorch baseline - # ------------------------------------------------------ - model = resnet_conv2_x_int8() - model.eval() - model.block_0_conv1.weight.data.copy_(block_0_int_weight_1) - model.block_0_conv2.weight.data.copy_(block_0_int_weight_2) - model.block_0_conv3.weight.data.copy_(block_0_int_weight_3) - model.shortcut.weight.data.copy_(block_0_int_weight_skip) - - model.block_1_conv1.weight.data.copy_(block_1_int_weight_1) - model.block_1_conv2.weight.data.copy_(block_1_int_weight_2) - model.block_1_conv3.weight.data.copy_(block_1_int_weight_3) - - model.block_2_conv1.weight.data.copy_(block_2_int_weight_1) - model.block_2_conv2.weight.data.copy_(block_2_int_weight_2) - model.block_2_conv3.weight.data.copy_(block_2_int_weight_3) - - golden_output = model(int_inp) - # ------------------------------------------------------ # Reorder input data-layout # ------------------------------------------------------ ds = DataShaper() - before_input = int_inp.squeeze().data.numpy().astype(dtype_in) - 
before_input.tofile(
-        log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d"
-    )
-    ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX")
-    ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
 
     block0_wts1 = ds.reorder_mat(
         block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
     )
@@ -437,51 +362,105 @@ def forward(self, x):
 
     total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d")
 
-    # ------------------------------------------------------
-    # Main run loop
-    # ------------------------------------------------------
-    for i in range(num_iter):
-        start = time.time_ns()
-        aie_output = execute(app, ifm_mem_fmt, total_wts3) * block_2_relu_3
-        stop = time.time_ns()
-
-        if enable_trace:
-            aie_output, trace = extract_trace(
-                aie_output, shape_out, dtype_out, trace_size
-            )
-            write_out_trace(trace, trace_file)
+    import time
+    import cv2
+
+    predicted_label = [None] * 64
+    cpu_predicted_label = [None] * 64
+    aie_time = [None] * 64
+    metafile = r"./data/cifar-10-batches-py/batches.meta"
+    datafile = r"./data/cifar-10-batches-py/test_batch"
+    data_batch_1 = unpickle(datafile)
+    metadata = unpickle(metafile)
+    images = data_batch_1["data"]
+    labels = data_batch_1["labels"]
+    images = np.reshape(images, (10000, 3, 32, 32))
+    dirname = "cifar_images"
+    if not os.path.exists(dirname):
+        os.mkdir(dirname)
+
+    # Extract and dump the first 100 images
+    for i in range(0, 100):
+        im = images[i]
+        im = im.transpose(1, 2, 0)
+        im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR)
+        im_name = f"./cifar_images/image_{i}.png"
+        cv2.imwrite(im_name, im)
+
+
+    label_path = "data/cifar10_label_map.txt"
+    model_num_classes = 10
+    class_label_map = load_class_label(label_path, model_num_classes)
+    quant_id_1 = QuantIdentity(
+        act_quant=Uint8ActPerTensorFixedPoint, bit_width=8, return_quant_tensor=True
+    )
+    quant_id_1.eval()
 
 
-        npu_time = stop - start
-        npu_time_total = npu_time_total + npu_time
     # ------------------------------------------------------
-    # Reorder output data-layout
+    # Main run loop
     # ------------------------------------------------------
-        temp_out = aie_output.reshape(32, 32, 32, 8)
-        temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD")
-        ofm_mem_fmt = temp_out.reshape(256, 32, 32)
-        ofm_mem_fmt.tofile(
-            log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d"
-        )
-        ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
+
+    for i in range(0, 64):
+        print("____________________________________IMAGE {}____________________________________________".format(i))
+        image_name = f"./cifar_images/image_{i}.png"
+        img = Image.open(image_name)
+        input_tensor = transform_test(img)
+        input_batch = input_tensor.unsqueeze(0)
+        with torch.no_grad():
+            # print(input_batch.shape)
+            start = time.time() * 1000
+            output1 = model.first(input_batch)
+
+            # AIE OFFLOAD
+            qnt_inp = model.aie.x_quant(output1)
+            int_inp = model.aie.x_quant(output1).int(float_datatype=True)
+            before_input = int_inp.squeeze().data.numpy().astype(dtype_in)
+            ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX")
+            start = time.time_ns()
+            aie_output = execute(app, ifm_mem_fmt, total_wts3) * block_2_relu_3
+            stop = time.time_ns()
+            temp_out = aie_output.reshape(32, 32, 32, 8)
+            temp2_out = ds.reorder_mat(temp_out, "CDYX", "YCXD")
+            ofm_mem_fmt = temp2_out.reshape(256, 32, 32)
+            ofm_mem_fmt = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
+            final_output_aie = model.post(ofm_mem_fmt)
+
+            # ------------------------------------------------------------------------------
+            # 
Baseline output for functional correctness
+            output_golden = model.aie(output1)
+            max_error = torch.max(torch.abs(ofm_mem_fmt - output_golden))
+            # print(max_error)
+            final_output_base = model.post(output_golden)
+            predicted_class = np.argmax(final_output_aie)
+            predicted_label[i] = metadata["label_names"][predicted_class]
+            cpu_predicted_class = np.argmax(final_output_base)
+            cpu_predicted_label[i] = metadata["label_names"][cpu_predicted_class]
+            label = metadata["label_names"][labels[i]]
+            print(
+                f" Predicted AIE: {predicted_label[i]}, Predicted CPU: {cpu_predicted_label[i]}"
+            )
+
+        # Calculate the five categories with the highest classification probability
+        prediction_class_index = (
+            torch.topk(final_output_aie, k=5, sorted=True).indices.squeeze(0).tolist()
+        )
+        golden_prediction_class_index = (
+            torch.topk(final_output_base, k=5, sorted=True).indices.squeeze(0).tolist()
+        )
+        npu_time = stop - start
+        npu_time_total = npu_time_total + npu_time
 
     # ------------------------------------------------------
     # Compare the AIE output and the golden reference
     # ------------------------------------------------------
-    print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000)))
-
-    if np.allclose(
-        ofm_mem_fmt_out.detach().numpy(),
-        golden_output.detach().numpy(),
-        rtol=0,
-        atol=block_2_relu_3,
-    ):
-        print("\nPASS!\n")
-        exit(0)
-    else:
-        print("\nFailed.\n")
-        exit(-1)
-
+    print("\nAvg NPU time: {}us.".format(int((npu_time_total / 64) / 1000)))
+    for x, y in zip(predicted_label, cpu_predicted_label):
+      if x != y:
+          print("\nFailed.\n")
+          exit(-1)
+    print("\nPASS!\n")
+    exit(0)
 
 if __name__ == "__main__":
     p = test_utils.create_default_argparser()
diff --git a/programming_examples/ml/resnet/ptq_conv2x/utils.py b/programming_examples/ml/resnet/ptq_conv2x/utils.py
new file mode 100644
index 0000000000..21a12f45c7
--- /dev/null
+++ b/programming_examples/ml/resnet/ptq_conv2x/utils.py
@@ -0,0 +1,40 @@
+import json
+import cv2
+import numpy as np
+
+def unpickle(file):
+    import pickle
+
+    with open(file, "rb") as fo:
+        dict = pickle.load(fo, encoding="latin1")
+    return dict
+
+
+def load_class_label(class_label_file: str, num_classes: int) -> list:
+    class_label = json.load(open(class_label_file))
+    class_label_list = [class_label[str(i)] for i in range(num_classes)]
+
+    return class_label_list
+
+
+def extract_cifar():
+    data_batch_1 = unpickle(datafile)
+    metadata = unpickle(metafile)
+
+    images = data_batch_1["data"]
+    labels = data_batch_1["labels"]
+    images = np.reshape(images, (10000, 3, 32, 32))
+
+    import os
+
+    dirname = "cifar_images"
+    if not os.path.exists(dirname):
+        os.mkdir(dirname)
+
+    # Extract and dump the first 100 images
+    for i in range(0, 100):
+        im = images[i]
+        im = im.transpose(1, 2, 0)
+        im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR)
+        im_name = f"./cifar_images/image_{i}.png"
+        cv2.imwrite(im_name, im)
\ No newline at end of file
From d82a6864a719317ebde17d49b70ecde640ce852d Mon Sep 17 00:00:00 2001
From: Gagandeep Singh
Date: Tue, 30 Apr 2024 10:33:20 -0600
Subject: [PATCH 3/8] black fixes for resnet ptq

---
 .../ml/resnet/ptq_conv2x/aie2.py              | 14 ++--
 .../ml/resnet/ptq_conv2x/model.py             | 13 +++-
 .../ml/resnet/ptq_conv2x/test.py              | 73 ++++++++++++-------
 .../ml/resnet/ptq_conv2x/utils.py             |  3 +-
 4 files changed, 65 insertions(+), 38 deletions(-)

diff --git a/programming_examples/ml/resnet/ptq_conv2x/aie2.py b/programming_examples/ml/resnet/ptq_conv2x/aie2.py
index 6992f3da81..334e1431bc 100755
--- a/programming_examples/ml/resnet/ptq_conv2x/aie2.py
+++ 
b/programming_examples/ml/resnet/ptq_conv2x/aie2.py @@ -580,7 +580,7 @@ def core_body(): @core(cores[i][1], "conv2dk3.o") def core_body(): - if(i==2): + if i == 2: scale = 9 else: scale = 9 @@ -700,7 +700,7 @@ def core_body(): @core(cores[i][3], "conv2dk3.o") def core_body(): - if(i==2): + if i == 2: scale = 9 else: scale = 9 @@ -948,16 +948,16 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): NpuWriteRTPOp("rtpComputeTile12", col=1, row=2, index=0, value=9) NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=0, value=12) NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=1, value=0) - + NpuWriteRTPOp("rtpComputeTile22", col=2, row=2, index=0, value=9) NpuWriteRTPOp("rtpComputeTile23", col=2, row=3, index=0, value=9) - NpuWriteRTPOp("rtpComputeTile25", col=2, row=5, index=0, value=9) + NpuWriteRTPOp("rtpComputeTile25", col=2, row=5, index=0, value=9) NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=0, value=12) NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=1, value=0) - rtp_1=[7,10,13,-2,10] - rtp_2=[8,10,12] - rtp_3=[9,9,12] + rtp_1 = [7, 10, 13, -2, 10] + rtp_2 = [8, 10, 12] + rtp_3 = [9, 9, 12] npu_dma_memcpy_nd( metadata="act1_00_02_01", diff --git a/programming_examples/ml/resnet/ptq_conv2x/model.py b/programming_examples/ml/resnet/ptq_conv2x/model.py index 68c6feaa8b..a980782021 100644 --- a/programming_examples/ml/resnet/ptq_conv2x/model.py +++ b/programming_examples/ml/resnet/ptq_conv2x/model.py @@ -2,6 +2,7 @@ import torch.nn as nn import torch.nn.functional as F + class CombinedModel(nn.Module): def __init__(self, first, aie, post): super(CombinedModel, self).__init__() @@ -15,6 +16,7 @@ def forward(self, x): x = self.post(x) return x + class PreAIELayers(nn.Module): def __init__(self): super(PreAIELayers, self).__init__() @@ -70,6 +72,8 @@ def forward(self, x): out = out.view(out.size(0), -1) out = self.linear(out) return out + + class Bottleneck_projected(nn.Module): expansion = 4 @@ -97,6 +101,7 @@ def __init__(self, in_planes, planes, stride=1, option="A"): ), nn.BatchNorm2d(self.expansion * planes), ) + def forward(self, x): out = self.relu1(self.bn1(self.conv1(x))) out = self.relu2(self.bn2(self.conv2(out))) @@ -105,6 +110,7 @@ def forward(self, x): out = self.relu3(out) return out + class Bottleneck_fused_projected(nn.Module): expansion = 4 @@ -137,7 +143,8 @@ def forward(self, x): out += self.shortcut(x) out = self.relu3(out) return out - + + def Resnet50_conv2x_offload(num_classes): return CombinedModel( PreAIELayers(), @@ -146,6 +153,6 @@ def Resnet50_conv2x_offload(num_classes): [ 1, ], - ), + ), PostAIELayers(Bottleneck_projected, [4, 6, 3], num_classes), - ) \ No newline at end of file + ) diff --git a/programming_examples/ml/resnet/ptq_conv2x/test.py b/programming_examples/ml/resnet/ptq_conv2x/test.py index 8bf5857bd6..e9278132b5 100755 --- a/programming_examples/ml/resnet/ptq_conv2x/test.py +++ b/programming_examples/ml/resnet/ptq_conv2x/test.py @@ -20,7 +20,7 @@ torch.use_deterministic_algorithms(True) torch.manual_seed(0) -from utils import unpickle,load_class_label +from utils import unpickle, load_class_label import torchvision from torchvision import transforms from PIL import Image @@ -39,6 +39,7 @@ from brevitas_examples.imagenet_classification.utils import SEED from brevitas_examples.imagenet_classification.utils import validate + def main(opts): design = "resnet_conv2_x_int8" xclbin_path = opts.xclbin @@ -71,7 +72,7 @@ def main(opts): # ------------------------------------------------------ num_classes = 10 model = 
res.Resnet50_conv2x_offload(num_classes) - weights = "trained_resnet50/weight.tar" #trained FP model + weights = "trained_resnet50/weight.tar" # trained FP model saved_model_dict = torch.load(weights, map_location=torch.device("cpu")) model.load_state_dict(saved_model_dict) @@ -112,8 +113,12 @@ def main(opts): indices = torch.arange(256) tr_sub = data_utils.Subset(train_dataset, indices) val_sub = data_utils.Subset(test_dataset, indices) - calib_loader = torch.utils.data.DataLoader(dataset=tr_sub, batch_size=64, shuffle=True) - val_loader = torch.utils.data.DataLoader(dataset=val_sub, batch_size=64, shuffle=False) + calib_loader = torch.utils.data.DataLoader( + dataset=tr_sub, batch_size=64, shuffle=True + ) + val_loader = torch.utils.data.DataLoader( + dataset=val_sub, batch_size=64, shuffle=False + ) img_shape = 32 model_aie = preprocess_for_flexml_quantize( model.aie, @@ -153,12 +158,11 @@ def main(opts): calibrate(calib_loader, model) model.eval() device, dtype = ( - next(model.parameters()).device, - next(model.parameters()).dtype, + next(model.parameters()).device, + next(model.parameters()).dtype, ) # ----------------------- - from numpy import load params = {} @@ -167,7 +171,9 @@ def main(opts): if isinstance(module, QuantConv2d): # print(name) # print(module.quant_weight().scale) - weights[name + ".int_weight"] = module.quant_weight().int(float_datatype=False) + weights[name + ".int_weight"] = module.quant_weight().int( + float_datatype=False + ) params[name + "_scale"] = module.quant_weight().scale.detach().numpy() if isinstance(module, QuantIdentity): # print(name) @@ -182,11 +188,13 @@ def main(opts): int_wts_data = load("int_weights.npz", allow_pickle=True) int_scale_data = load("int_conv_scale.npz", allow_pickle=True) - int_wts_data_lst = int_wts_data.files + int_wts_data_lst = int_wts_data.files block_0_int_weight_1 = torch.from_numpy(int_wts_data["aie.layer1.conv1.int_weight"]) block_0_int_weight_2 = torch.from_numpy(int_wts_data["aie.layer1.conv2.int_weight"]) block_0_int_weight_3 = torch.from_numpy(int_wts_data["aie.layer1.conv3.int_weight"]) - block_0_int_weight_skip = torch.from_numpy(int_wts_data["aie.layer1.shortcut.0.int_weight"]) + block_0_int_weight_skip = torch.from_numpy( + int_wts_data["aie.layer1.shortcut.0.int_weight"] + ) block_1_int_weight_1 = torch.from_numpy(int_wts_data["aie.layer2.conv1.int_weight"]) block_1_int_weight_2 = torch.from_numpy(int_wts_data["aie.layer2.conv2.int_weight"]) @@ -232,20 +240,20 @@ def main(opts): param.data.fill_(0) block_0_combined_scale1 = -math.log( - init_scale * block_0_weight_scale_1 / block_0_relu_1, 2 + init_scale * block_0_weight_scale_1 / block_0_relu_1, 2 ) # after conv1x1 block_0_combined_scale2 = -math.log( - block_0_relu_1 * block_0_weight_scale_2 / block_0_relu_2, 2 + block_0_relu_1 * block_0_weight_scale_2 / block_0_relu_2, 2 ) # after conv3x3 block_0_combined_scale3 = -math.log( - block_0_relu_2 * block_0_weight_scale_3 / block_0_add_scale, 2 + block_0_relu_2 * block_0_weight_scale_3 / block_0_add_scale, 2 ) # after conv1x1 block_0_combined_scale4 = -math.log( - block_0_add_scale / block_0_relu_3, 2 + block_0_add_scale / block_0_relu_3, 2 ) # after skip addition using init scale # combined_scale4=-math.log(inp_scale1/inp_scale4) block_0_combined_scale_skip = -math.log( - init_scale * block_0_weight_scale_skip / block_0_add_scale, 2 + init_scale * block_0_weight_scale_skip / block_0_add_scale, 2 ) # after LHS conv1x1 block_1_combined_scale1 = -math.log( @@ -278,14 +286,18 @@ def main(opts): print("Block0 
combined_scale after first conv1x1:", block_0_combined_scale1)
     print("Block0 combined_scale after second conv3x3:", block_0_combined_scale2)
     print("Block0 combined_scale after third conv1x1:", block_0_combined_scale3)
-    print("Block0 combined_scale after adding skip connection:", (block_0_combined_scale4))
+    print(
+        "Block0 combined_scale after adding skip connection:", (block_0_combined_scale4)
+    )
     print("Block0 combined_scale after skip conv1x1:", block_0_combined_scale_skip)
     print("--------------------------------------------------------------")
     print("Block1 combined_scale after first conv1x1:", block_1_combined_scale1)
     print("Block1 combined_scale after second conv3x3:", block_1_combined_scale2)
     print("Block1 combined_scale after third conv1x1:", block_1_combined_scale3)
-    print("Block1 combined_scale after adding skip connection:", (block_1_combined_scale4))
+    print(
+        "Block1 combined_scale after adding skip connection:", (block_1_combined_scale4)
+    )
     print("--------------------------------------------------------------")
     print("Block2 combined_scale block2 after first conv1x1:", block_2_combined_scale1)
     print("Block2 combined_scale block2 after second conv3x3:", block_2_combined_scale2)
@@ -295,7 +307,7 @@ def main(opts):
         (block_2_combined_scale4),
     )
     print("------------------------------------------------------------------")
-    # ------------------------------------------------------ 
+    # ------------------------------------------------------
     # Get device, load the xclbin & kernel and register them
     # ------------------------------------------------------
     app = setup_aie(
@@ -387,7 +399,6 @@ def main(opts):
         im_name = f"./cifar_images/image_{i}.png"
         cv2.imwrite(im_name, im)
 
-
     label_path = "data/cifar10_label_map.txt"
     model_num_classes = 10
     class_label_map = load_class_label(label_path, model_num_classes)
@@ -396,13 +407,16 @@ def main(opts):
     )
     quant_id_1.eval()
 
-
     # ------------------------------------------------------
     # Main run loop
     # ------------------------------------------------------
-
+
     for i in range(0, 64):
-        print("____________________________________IMAGE {}____________________________________________".format(i))
+        print(
+            "____________________________________IMAGE {}____________________________________________".format(
+                i
+            )
+        )
         image_name = f"./cifar_images/image_{i}.png"
         img = Image.open(image_name)
         input_tensor = transform_test(img)
         input_batch = input_tensor.unsqueeze(0)
@@ -443,10 +457,14 @@ def main(opts):
 
         # Calculate the five categories with the highest classification probability
         prediction_class_index = (
-            torch.topk(final_output_aie, k=5, sorted=True).indices.squeeze(0).tolist()
+            torch.topk(final_output_aie, k=5, sorted=True)
+            .indices.squeeze(0)
+            .tolist()
         )
         golden_prediction_class_index = (
-            torch.topk(final_output_base, k=5, sorted=True).indices.squeeze(0).tolist()
+            torch.topk(final_output_base, k=5, sorted=True)
+            .indices.squeeze(0)
+            .tolist()
         )
         npu_time = stop - start
         npu_time_total = npu_time_total + npu_time
@@ -456,12 +474,13 @@ def main(opts):
     # ------------------------------------------------------
     print("\nAvg NPU time: {}us.".format(int((npu_time_total / 64) / 1000)))
     for x, y in zip(predicted_label, cpu_predicted_label):
-      if x != y:
-          print("\nFailed.\n")
-          exit(-1)
+        if x != y:
+            print("\nFailed.\n")
+            exit(-1)
     print("\nPASS!\n")
     exit(0)
 
+
 if __name__ == "__main__":
     p = test_utils.create_default_argparser()
     opts = p.parse_args(sys.argv[1:])
diff --git a/programming_examples/ml/resnet/ptq_conv2x/utils.py b/programming_examples/ml/resnet/ptq_conv2x/utils.py
index 
21a12f45c7..8ac8e93b8d 100644 --- a/programming_examples/ml/resnet/ptq_conv2x/utils.py +++ b/programming_examples/ml/resnet/ptq_conv2x/utils.py @@ -2,6 +2,7 @@ import cv2 import numpy as np + def unpickle(file): import pickle @@ -37,4 +38,4 @@ def extract_cifar(): im = im.transpose(1, 2, 0) im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) im_name = f"./cifar_images/image_{i}.png" - cv2.imwrite(im_name, im) \ No newline at end of file + cv2.imwrite(im_name, im) From eccf94f0b3a41d896b0e629b2e45cb5a2a8c8998 Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Tue, 30 Apr 2024 11:24:59 -0600 Subject: [PATCH 4/8] disable ptq --- programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit b/programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit index 6097345491..4d506d9c43 100644 --- a/programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit +++ b/programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit @@ -1,7 +1,7 @@ // (c) Copyright 2024 Advanced Micro Devices, Inc. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// REQUIRES: ryzen_ai, chess, torch +// REQUIRES: ryzen_ai, chess, torch, dontrun // // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile From cc9c43bd4a8f5ba4eb2fdeca2d8d0c97573a9fad Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Mon, 27 May 2024 20:22:34 -0600 Subject: [PATCH 5/8] PTQ readme --- .../ml/resnet/ptq_conv2x/CMakeLists.txt | 89 ------------------- .../ml/resnet/ptq_conv2x/Makefile | 2 +- .../ml/resnet/ptq_conv2x/README.md | 76 ++++++++++++++++ .../ml/resnet/ptq_conv2x/test.py | 16 ++-- 4 files changed, 85 insertions(+), 98 deletions(-) delete mode 100755 programming_examples/ml/resnet/ptq_conv2x/CMakeLists.txt create mode 100644 programming_examples/ml/resnet/ptq_conv2x/README.md diff --git a/programming_examples/ml/resnet/ptq_conv2x/CMakeLists.txt b/programming_examples/ml/resnet/ptq_conv2x/CMakeLists.txt deleted file mode 100755 index c7db0e9c5c..0000000000 --- a/programming_examples/ml/resnet/ptq_conv2x/CMakeLists.txt +++ /dev/null @@ -1,89 +0,0 @@ -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# Copyright (C) 2024, Advanced Micro Devices, Inc. 
- -# parameters -# -DBOOST_ROOT: Path to Boost install -# -DOpenCV_DIR: Path to OpenCV install -# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo -# -DXRT_LIB_DIR: Path to xrt_coreutil.lib -# -DTARGET_NAME: Target name to be built - -# cmake needs this line -cmake_minimum_required(VERSION 3.1) - -find_program(WSL NAMES powershell.exe) - -if (NOT WSL) - set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") - set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install") - set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") -else() - set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") - set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install") - set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") -endif () - -set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width") -set(EDGEDETECT_HEIGHT 1080 CACHE STRING "image height") - -set(TARGET_NAME test CACHE STRING "Target to be built") - -SET (ProjectName ${TARGET_NAME}) -SET (currentTarget ${TARGET_NAME}) - -if ( WSL ) - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) -endif () - -project(${ProjectName}) - -# Find packages -find_package(Boost REQUIRED) -find_package(OpenCV REQUIRED) -message("opencv library paht: ${OpenCV_LIB_PATH}") -message("opencv libs: ${OpenCV_LIBS}") - - -add_executable(${currentTarget} - ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp - test.cpp -) - -target_compile_definitions(${currentTarget} PUBLIC - EDGEDETECT_WIDTH=${EDGEDETECT_WIDTH} - EDGEDETECT_HEIGHT=${EDGEDETECT_HEIGHT} - DISABLE_ABI_CHECK=1 - ) - -target_include_directories (${currentTarget} PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils - ${XRT_INC_DIR} - ${OpenCV_INCLUDE_DIRS} - ${Boost_INCLUDE_DIRS} -) - -target_link_directories(${currentTarget} PUBLIC - ${XRT_LIB_DIR} - ${OpenCV_LIB_PATH} - ${Boost_LIBRARY_DIRS} -) - -if (NOT WSL) - target_link_libraries(${currentTarget} PUBLIC - xrt_coreutil - ${OpenCV_LIBS} - boost_program_options - boost_filesystem - ) -else() - target_link_libraries(${currentTarget} PUBLIC - xrt_coreutil - ${OpenCV_LIBS} - ) -endif() diff --git a/programming_examples/ml/resnet/ptq_conv2x/Makefile b/programming_examples/ml/resnet/ptq_conv2x/Makefile index 4b40c07da9..79e443d308 100755 --- a/programming_examples/ml/resnet/ptq_conv2x/Makefile +++ b/programming_examples/ml/resnet/ptq_conv2x/Makefile @@ -43,7 +43,7 @@ build/final.xclbin: build/${mlirFileName}.mlir build/conv2dk1_i8.o build/conv2dk clean: rm -rf build/*.elf* build/*.lst build/*.bif log* build/${mlirFileName}.mlir.prj build/*.xclbin sim \ build/chess* build/insts.txt \ - build/*.log build/aie_partition.json build/*.bin build/BOOT.BIN _x test.exe + build/*.log build/aie_partition.json build/*.bin build/BOOT.BIN _x run_py: ${powershell} python3 ${srcdir}/test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE diff --git a/programming_examples/ml/resnet/ptq_conv2x/README.md b/programming_examples/ml/resnet/ptq_conv2x/README.md new file mode 100644 index 0000000000..f1e68a3b3e --- /dev/null +++ b/programming_examples/ml/resnet/ptq_conv2x/README.md @@ -0,0 +1,76 @@ + + +# ResNet with Offloaded Conv2_x Layers and Post-Training 
Quantization + +Quantization involves reducing the precision of the weights and activations of a neural network from floating-point (e.g., 32-bit float) to lower bit-width formats (e.g., 8-bit integers). Quantization reduces model size and speeds up inference, making it more suitable for deployment on resource-constrained devices. In AI Engine (AIE), we use a power-of-two scale factor to set up the SRS to shift and scale the values to the integer range. A power of two is a number of the form 2^n, where n is an integer. Power-of-two scale factors can lead to more efficient hardware implementations, as multiplication by a power of two can be performed using bit shifts rather than more complex multiplication operations. + +[Brevitas](https://github.com/Xilinx/brevitas) is a PyTorch-based library designed for quantization of neural networks. It enables users to train models with reduced numerical precision, typically using lower bit widths for weights and activations, which can lead to significant improvements in computational efficiency and memory usage. Brevitas supports various quantization schemes, including uniform and non-uniform quantization, and can be used to target a wide range of hardware platforms, including FPGAs, ASICs, and CPUs. We use Brevitas to: +1. Quantize weights and activations of a model to lower bit format for AIE deployment, and +2. Extract proper power-of-two scale factors to set up the SRS unit. + +## Source Files Overview + +``` +. ++-- ptq_conv2x # Implementation of ResNet conv2_x layers on NPU with PTQ ++-- +-- data # Labels for CIFAR dataset. +| +-- aie2.py # A Python script that defines the AIE array structural design using MLIR-AIE operations. +| +-- Makefile # Contains instructions for building and compiling software projects. +| +-- model.py # Python code for ResNet Model where we apply PTQ. +| +-- README.md # This file. +| +-- requirements.txt # pip requirements to perform PTQ. +| +-- run_makefile.lit # For LLVM Integrated Tester (LIT) of the design. +| +-- test.py # Python code testbench for the design example. +| +-- utils.py # Python code for miscellaneous functions needed for inference. + + +``` + +# Post-Training Quantization Using Brevitas +To enhance the efficiency of our implementation, we perform post-training quantization on the model using the Brevitas library. This step converts the model to use 8-bit weights and power-of-two scale factors, optimizing it for deployment on hardware with limited precision requirements. + + +## Step-by-Step Process +We use test.py to: + +**1. Loading the Pre-trained ResNet Model**: The script begins by loading a pre-trained ResNet model, which serves as the baseline for quantization and inference. + +**2. Applying Post-Training Quantization (PTQ)**: Using the Brevitas library, the script applies PTQ to the conv2_x layers of the ResNet model. This involves converting the weights and activations to 8-bit precision. + +**3. Extracting Power-of-Two Scale Factors**: After quantizing the weights and activations, the script extracts the power-of-two scale factors. These factors are crucial for efficient hardware implementation, as they simplify multiplication operations to bit shifts. + +**4. Calculating Combined Scales**: The combined scale factors are calculated by multiplying the extracted weight and activation scales for each layer. These combined scales are then used to set up the SRS unit. + +**5. 
Setting Up the SRS Unit**: +The SRS unit uses the calculated combined scales to efficiently shift and scale the values to the integer range required for the NPU. + +**6. Running Inference**: Finally, the script runs inference on the quantized model. The conv2_x layers are offloaded to the NPU, utilizing the SRS unit to scale the quantized weights and activations to the int8 range properly. + +# Compilation and Execution + +## Prerequisites +Ensure you have the necessary dependencies installed. You can install the required packages using: + +``` +pip install -r requirements.txt +``` +## Compilation +To compile the design: +``` +make +``` + +## Running the Design + +To run the design: +``` +make run_py +``` diff --git a/programming_examples/ml/resnet/ptq_conv2x/test.py b/programming_examples/ml/resnet/ptq_conv2x/test.py index e9278132b5..67257625a9 100755 --- a/programming_examples/ml/resnet/ptq_conv2x/test.py +++ b/programming_examples/ml/resnet/ptq_conv2x/test.py @@ -70,6 +70,7 @@ def main(opts): # ------------------------------------------------------ # Post training quantization to get int8 weights and activation for AIE # ------------------------------------------------------ + # Step 1: Load the pre-trained ResNet model num_classes = 10 model = res.Resnet50_conv2x_offload(num_classes) weights = "trained_resnet50/weight.tar" # trained FP model @@ -109,7 +110,7 @@ def main(opts): root=data_dir, train=False, transform=transform_test, download=True ) - # Data loader + # Data loader for calibration indices = torch.arange(256) tr_sub = data_utils.Subset(train_dataset, indices) val_sub = data_utils.Subset(test_dataset, indices) @@ -119,6 +120,8 @@ def main(opts): val_loader = torch.utils.data.DataLoader( dataset=val_sub, batch_size=64, shuffle=False ) + + # Step 2: Apply quantization to the conv2_x layers to convert weights to 8-bit precision img_shape = 32 model_aie = preprocess_for_flexml_quantize( model.aie, @@ -131,7 +134,7 @@ def main(opts): quant_model = quantize_model( model_aie, backend="flexml", - scale_factor_type="po2_scale", + scale_factor_type="po2_scale", # Ensuring scale factors are powers of two bias_bit_width=32, weight_bit_width=8, weight_narrow_range=False, @@ -165,29 +168,25 @@ def main(opts): from numpy import load + # Extracting quantized weights and scale factors params = {} weights = {} for name, module in model.named_modules(): if isinstance(module, QuantConv2d): - # print(name) - # print(module.quant_weight().scale) weights[name + ".int_weight"] = module.quant_weight().int( float_datatype=False ) params[name + "_scale"] = module.quant_weight().scale.detach().numpy() if isinstance(module, QuantIdentity): - # print(name) - # print(module.quant_act_scale()) params[name + "_scale"] = module.quant_act_scale() if isinstance(module, QuantReLU): - # print(name) - # print(module.quant_act_scale()) params[name + "_scale"] = module.quant_act_scale() np.savez(os.path.join(os.getcwd(), "int_weights.npz"), **weights) np.savez(os.path.join(os.getcwd(), "int_conv_scale.npz"), **params) int_wts_data = load("int_weights.npz", allow_pickle=True) int_scale_data = load("int_conv_scale.npz", allow_pickle=True) + # Loading weights and scales int_wts_data_lst = int_wts_data.files block_0_int_weight_1 = torch.from_numpy(int_wts_data["aie.layer1.conv1.int_weight"]) block_0_int_weight_2 = torch.from_numpy(int_wts_data["aie.layer1.conv2.int_weight"]) @@ -239,6 +238,7 @@ def main(opts): if name.endswith(".bias"): param.data.fill_(0) + # Calculate combined scales block_0_combined_scale1 = 
-math.log( init_scale * block_0_weight_scale_1 / block_0_relu_1, 2 ) # after conv1x1 From d3bd1867000dfe5af874897259f01de913daab73 Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Mon, 27 May 2024 20:26:22 -0600 Subject: [PATCH 6/8] PTQ readme --- programming_examples/ml/resnet/ptq_conv2x/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programming_examples/ml/resnet/ptq_conv2x/README.md b/programming_examples/ml/resnet/ptq_conv2x/README.md index f1e68a3b3e..a3a2d7647c 100644 --- a/programming_examples/ml/resnet/ptq_conv2x/README.md +++ b/programming_examples/ml/resnet/ptq_conv2x/README.md @@ -10,9 +10,9 @@ # ResNet with Offloaded Conv2_x Layers and Post-Training Quantization -Quantization involves reducing the precision of the weights and activations of a neural network from floating-point (e.g., 32-bit float) to lower bit-width formats (e.g., 8-bit integers). Quantization reduces model size and speeds up inference, making it more suitable for deployment on resource-constrained devices. In AI Engine (AIE), we use a power-of-two scale factor to set up the SRS to shift and scale the values to the integer range. A power of two is a number of the form 2^n, where n is an integer. Power-of-two scale factors can lead to more efficient hardware implementations, as multiplication by a power of two can be performed using bit shifts rather than more complex multiplication operations. +Quantization involves reducing the precision of the weights and activations of a neural network from floating-point (e.g., 32-bit float) to lower bit-width formats (e.g., 8-bit integer). Quantization reduces model size and speeds up inference, making a model more suitable for deployment on resource-constrained devices. In AI Engine (AIE), we use a power-of-two scale factor for the SRS (Shift-Round-Saturate) Unit to shift and scale the values to an integer range. A power of two is a number of the form 2^n, where n is an integer. Power-of-two scale factors can lead to more efficient hardware implementations, as multiplication by a power of two can be performed using bit shifts rather than more complex multiplication operations. -[Brevitas](https://github.com/Xilinx/brevitas) is a PyTorch-based library designed for quantization of neural networks. It enables users to train models with reduced numerical precision, typically using lower bit widths for weights and activations, which can lead to significant improvements in computational efficiency and memory usage. Brevitas supports various quantization schemes, including uniform and non-uniform quantization, and can be used to target a wide range of hardware platforms, including FPGAs, ASICs, and CPUs. We use Brevitas to: +[Brevitas](https://github.com/Xilinx/brevitas) is a PyTorch-based library designed for quantization of neural networks. It enables users to train models with reduced numerical precision, typically using lower bit widths for weights and activations, which can lead to significant improvements in computational efficiency and memory usage. Brevitas supports various quantization schemes, including uniform and non-uniform quantization, and can be used to target a wide range of hardware platforms, including FPGAs, ASICs, and AIEs. We use Brevitas to: 1. Quantize weights and activations of a model to lower bit format for AIE deployment, and 2. Extract proper power-of-two scale factors to set up the SRS unit. 
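The combined scales computed in `test.py` above are exactly the shift amounts the SRS unit needs: because every Brevitas scale is constrained to `po2_scale`, the floating-point requantization factor collapses to a single power of two, and `-math.log(..., 2)` yields an integer. Below is a minimal sketch of that arithmetic; the `srs` helper and the concrete scale values are illustrative stand-ins, not part of the design.

```python
import math

import numpy as np

# Toy per-tensor scales; in the design these come from Brevitas
# (e.g., init_scale, a conv weight scale, and the following ReLU scale).
# All are powers of two because quantization uses scale_factor_type="po2_scale".
input_scale, weight_scale, output_scale = 0.25, 0.0078125, 0.5

# Same form as the block_*_combined_scale* values in test.py; the result
# is an integer shift amount because every factor is a power of two.
shift = round(-math.log2(input_scale * weight_scale / output_scale))  # -> 8


def srs(acc: np.ndarray, shift: int) -> np.ndarray:
    """Shift-round-saturate: bring int32 accumulators back into int8 range."""
    rounded = np.round(acc.astype(np.int64) * 2.0**-shift)
    return np.clip(rounded, -128, 127).astype(np.int8)


print(srs(np.array([12345, -6789, 400000], dtype=np.int32), shift))  # [ 48 -27 127]
```

This is also why the run-time parameters written in `aie2.py` (the values 9 and 12 in the `NpuWriteRTPOp` calls) are plain integers: each kernel only needs a shift count, not a floating-point multiplier.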
From 896e0d66156c25036f8804466ce0fd1f4d553f6f Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Mon, 27 May 2024 20:27:54 -0600 Subject: [PATCH 7/8] black fix --- programming_examples/ml/resnet/ptq_conv2x/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programming_examples/ml/resnet/ptq_conv2x/test.py b/programming_examples/ml/resnet/ptq_conv2x/test.py index 67257625a9..175cfb8fab 100755 --- a/programming_examples/ml/resnet/ptq_conv2x/test.py +++ b/programming_examples/ml/resnet/ptq_conv2x/test.py @@ -134,7 +134,7 @@ def main(opts): quant_model = quantize_model( model_aie, backend="flexml", - scale_factor_type="po2_scale", # Ensuring scale factors are powers of two + scale_factor_type="po2_scale", # Ensuring scale factors are powers of two bias_bit_width=32, weight_bit_width=8, weight_narrow_range=False, @@ -238,7 +238,7 @@ def main(opts): if name.endswith(".bias"): param.data.fill_(0) - # Calculate combined scales + # Calculate combined scales block_0_combined_scale1 = -math.log( init_scale * block_0_weight_scale_1 / block_0_relu_1, 2 ) # after conv1x1 From c1d477953d820591fe18fe1b9d6a82a9932dabc6 Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Mon, 27 May 2024 21:49:55 -0600 Subject: [PATCH 8/8] PTQ readme update --- programming_examples/ml/resnet/ptq_conv2x/README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/programming_examples/ml/resnet/ptq_conv2x/README.md b/programming_examples/ml/resnet/ptq_conv2x/README.md index a3a2d7647c..d5f87d2a9a 100644 --- a/programming_examples/ml/resnet/ptq_conv2x/README.md +++ b/programming_examples/ml/resnet/ptq_conv2x/README.md @@ -12,7 +12,9 @@ Quantization involves reducing the precision of the weights and activations of a neural network from floating-point (e.g., 32-bit float) to lower bit-width formats (e.g., 8-bit integer). Quantization reduces model size and speeds up inference, making a model more suitable for deployment on resource-constrained devices. In AI Engine (AIE), we use a power-of-two scale factor for the SRS (Shift-Round-Saturate) Unit to shift and scale the values to an integer range. A power of two is a number of the form 2^n, where n is an integer. Power-of-two scale factors can lead to more efficient hardware implementations, as multiplication by a power of two can be performed using bit shifts rather than more complex multiplication operations. -[Brevitas](https://github.com/Xilinx/brevitas) is a PyTorch-based library designed for quantization of neural networks. It enables users to train models with reduced numerical precision, typically using lower bit widths for weights and activations, which can lead to significant improvements in computational efficiency and memory usage. Brevitas supports various quantization schemes, including uniform and non-uniform quantization, and can be used to target a wide range of hardware platforms, including FPGAs, ASICs, and AIEs. We use Brevitas to: +[Brevitas](https://github.com/Xilinx/brevitas) is a PyTorch-based library designed for quantization of neural networks. It enables users to train models with reduced numerical precision, typically using lower bit widths for weights and activations, which can lead to significant improvements in computational efficiency and memory usage. Brevitas supports various quantization schemes, including uniform and non-uniform quantization, and can be used to target a wide range of hardware platforms, including FPGAs, ASICs, and AIEs. + +We use Brevitas to: 1. 
Quantize weights and activations of a model to lower bit format for AIE deployment, and 2. Extract proper power-of-two scale factors to set up the SRS unit. @@ -20,7 +22,7 @@ Quantization involves reducing the precision of the weights and activations of a ``` . -+-- ptq_conv2x # Implementation of ResNet conv2_x layers on NPU with PTQ ++-- ptq_conv2x # Implementation of ResNet conv2_x layers on NPU with PTQ. +-- +-- data # Labels for CIFAR dataset. | +-- aie2.py # A Python script that defines the AIE array structural design using MLIR-AIE operations. | +-- Makefile # Contains instructions for building and compiling software projects. @@ -35,11 +37,11 @@ Quantization involves reducing the precision of the weights and activations of a ``` # Post-Training Quantization Using Brevitas -To enhance the efficiency of our implementation, we perform post-training quantization on the model using the Brevitas library. This step converts the model to use 8-bit weights and power-of-two scale factors, optimizing it for deployment on hardware with limited precision requirements. +To enhance the efficiency of our implementation, we perform post-training quantization on the model using the Brevitas library. This step converts the model to use 8-bit weights and activations and power-of-two scale factors, optimizing it for deployment on hardware with limited precision requirements. ## Step-by-Step Process -We use test.py to: +We use test.py to perform the following key tasks: **1. Loading the Pre-trained ResNet Model**: The script begins by loading a pre-trained ResNet model, which serves as the baseline for quantization and inference.
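For reference, the weight and scale extraction that steps 2 through 4 describe can be condensed into one helper built from the Brevitas accessors `test.py` already uses. This is a sketch: the function name is ours, and `quant_model` stands for whatever model `quantize_model` returned.

```python
import torch
from brevitas.nn import QuantConv2d, QuantIdentity, QuantReLU


def extract_int_weights_and_scales(quant_model: torch.nn.Module):
    """Collect int8 weight tensors and per-tensor power-of-two scale
    factors from a Brevitas-quantized model."""
    weights, scales = {}, {}
    for name, module in quant_model.named_modules():
        if isinstance(module, QuantConv2d):
            # Integer weights as shipped to the NPU, plus their scale.
            weights[name + ".int_weight"] = module.quant_weight().int(
                float_datatype=False
            )
            scales[name + "_scale"] = module.quant_weight().scale.detach()
        elif isinstance(module, (QuantIdentity, QuantReLU)):
            # Activation scales, needed for the combined-scale computation.
            scales[name + "_scale"] = module.quant_act_scale()
    return weights, scales
```

The two dictionaries mirror the `int_weights.npz` and `int_conv_scale.npz` files that `test.py` saves and reloads before computing the combined scales.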