ptq for offloaded resnet conv2x

Xilinx · Apr 30, 2024 · c1993b0 · c1993b0
1 parent 619ff90
commit c1993b0
Show file tree

Hide file tree

Showing 7 changed files with 560 additions and 372 deletions.
diff --git a/programming_examples/ml/resnet/ptq_conv2x/aie2.py b/programming_examples/ml/resnet/ptq_conv2x/aie2.py
@@ -580,7 +580,10 @@ def core_body():
 
                 @core(cores[i][1], "conv2dk3.o")
                 def core_body():
-                    scale = 11
+                    if(i==2):
+                        scale = 9
+                    else:
+                        scale = 9
                     for _ in for_(sys.maxsize):
 
                         # acquire weights and rtps once
@@ -697,7 +700,10 @@ def core_body():
 
                 @core(cores[i][3], "conv2dk3.o")
                 def core_body():
-                    scale = 11
+                    if(i==2):
+                        scale = 9
+                    else:
+                        scale = 9
                     for _ in for_(sys.maxsize):
 
                         # acquire weights and rtps once
@@ -930,30 +936,28 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                 # for c, col in enumerate(rtp_name):
                 #     for r, row in enumerate(col):
                 #         NpuWriteRTPOp(row, col=c, row=r + 2, index=0, value=1)  # scale
-
-                # NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=1, value=0)
-                # NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=2, value=1)
-
-                # NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=1, value=0)
-
-                # NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=1, value=0)
-
-                # #     # write RTP parameters
-                # npuWriteRTPOp(
-                #     "rtpComputeTile02", col=0, row=2, index=0, value=1
-                # )  # scale
-                # npuWriteRTPOp(
-                #     "rtpComputeTile03", col=0, row=3, index=0, value=1
-                # )  # scale
-                # npuWriteRTPOp(
-                #     "rtpComputeTile05", col=0, row=5, index=0, value=1
-                # )  # scale
-                # npuWriteRTPOp(
-                #     "rtpComputeTile04", col=0, row=4, index=0, value=1
-                # )  # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input
-                # npuWriteRTPOp(
-                #     "rtpComputeTile04", col=0, row=4, index=1, value=0
-                # )  # skip_scale
+                NpuWriteRTPOp("rtpComputeTile02", col=0, row=2, index=0, value=8)
+                NpuWriteRTPOp("rtpComputeTile03", col=0, row=3, index=0, value=9)
+                NpuWriteRTPOp("rtpComputeTile04", col=0, row=5, index=0, value=9)
+                NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=0, value=11)
+                NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=1, value=0)
+                NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=2, value=7)
+
+                NpuWriteRTPOp("rtpComputeTile15", col=1, row=5, index=0, value=9)
+                NpuWriteRTPOp("rtpComputeTile14", col=1, row=4, index=0, value=9)
+                NpuWriteRTPOp("rtpComputeTile12", col=1, row=2, index=0, value=9)
+                NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=0, value=12)
+                NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=1, value=0)
+
+                NpuWriteRTPOp("rtpComputeTile22", col=2, row=2, index=0, value=9)
+                NpuWriteRTPOp("rtpComputeTile23", col=2, row=3, index=0, value=9)
+                NpuWriteRTPOp("rtpComputeTile25", col=2, row=5, index=0, value=9)        
+                NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=0, value=12)
+                NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=1, value=0)
+
+                rtp_1=[7,10,13,-2,10]
+                rtp_2=[8,10,12]
+                rtp_3=[9,9,12]
 
                 npu_dma_memcpy_nd(
                     metadata="act1_00_02_01",

diff --git a/programming_examples/ml/resnet/ptq_conv2x/data/cifar10_label_map.txt b/programming_examples/ml/resnet/ptq_conv2x/data/cifar10_label_map.txt
@@ -0,0 +1 @@
+{"0": "airplane", "1": "automobile", "2": "bird", "3": "cat", "4": "deer", "5": "dog", "6": "frog", "7": "horse", "8": "ship", "9": "truck"}
diff --git a/programming_examples/ml/resnet/ptq_conv2x/model.py b/programming_examples/ml/resnet/ptq_conv2x/model.py
@@ -0,0 +1,151 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class CombinedModel(nn.Module):
+    """Run three sub-modules back to back: ``first`` -> ``aie`` -> ``post``.
+
+    Each constructor argument is stored as an attribute of the same name,
+    and ``forward`` feeds the output of one stage into the next.
+    """
+
+    def __init__(self, first, aie, post):
+        super().__init__()
+        self.first = first
+        self.aie = aie
+        self.post = post
+
+    def forward(self, x):
+        """Return ``post(aie(first(x)))``."""
+        return self.post(self.aie(self.first(x)))
+
+class PreAIELayers(nn.Module):
+    """Input stem: 3x3 convolution (3 -> 64 channels), batch norm, ReLU.
+
+    The convolution uses stride 1 and padding 1 and has no bias term.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.conv1 = nn.Conv2d(
+            in_channels=3,
+            out_channels=64,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=False,
+        )
+        self.bn1 = nn.BatchNorm2d(num_features=64)
+
+    def forward(self, x):
+        """Return ``relu(bn1(conv1(x)))``."""
+        return F.relu(self.bn1(self.conv1(x)))
+
+
+class AIEConv2xOffload(nn.Module):
+    """Three ``block`` instances applied in sequence.
+
+    ``layer1`` is built with 64 input channels; ``layer2`` and ``layer3``
+    are built with 256 input channels. All three use ``planes=64``.
+
+    Args:
+        block: callable building one stage from ``in_planes`` and ``planes``
+            keyword arguments.
+        num_blocks: accepted for signature compatibility; not read by this
+            class.
+    """
+
+    def __init__(self, block, num_blocks):
+        super().__init__()
+        # Stored on the instance but not read anywhere inside this class.
+        self.in_planes = 64
+        self.layer1 = block(in_planes=64, planes=64)
+        self.layer2 = block(in_planes=256, planes=64)
+        self.layer3 = block(in_planes=256, planes=64)
+
+    def forward(self, x):
+        """Pass ``x`` through ``layer1``, ``layer2`` and ``layer3`` in order."""
+        for stage in (self.layer1, self.layer2, self.layer3):
+            x = stage(x)
+        return x
+
+
+class PostAIELayers(nn.Module):
+    """Three stacked stages followed by pooling and a linear classifier.
+
+    Stages ``layer2``, ``layer3`` and ``layer4`` are built with 128, 256 and
+    512 planes and ``num_blocks[0]``, ``num_blocks[1]``, ``num_blocks[2]``
+    blocks respectively. The first stage is built for 256 input channels.
+
+    Args:
+        block: block class; must expose an ``expansion`` attribute and accept
+            ``(in_planes, planes, stride)`` positionally.
+        num_blocks: sequence with the block count of each of the three stages.
+        num_classes: output size of the final linear layer.
+    """
+
+    def __init__(self, block, num_blocks, num_classes):
+        super().__init__()
+
+        # Running input-channel count; _make_layer updates it after each block.
+        self.in_planes = 256
+        self.layer2 = self._make_layer(block, 128, num_blocks[0], stride=2)
+        self.layer3 = self._make_layer(block, 256, num_blocks[1], stride=2)
+        self.layer4 = self._make_layer(block, 512, num_blocks[2], stride=2)
+        self.linear = nn.Linear(512 * block.expansion, num_classes)
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        """Build one stage; only its first block receives ``stride``, the rest get 1."""
+        stage = []
+        for block_stride in [stride] + [1] * (num_blocks - 1):
+            stage.append(block(self.in_planes, planes, block_stride))
+            self.in_planes = planes * block.expansion
+        return nn.Sequential(*stage)
+
+    def forward(self, x):
+        """Run the three stages, average-pool with a 32 window, flatten, classify."""
+        for stage in (self.layer2, self.layer3, self.layer4):
+            x = stage(x)
+        pooled = F.avg_pool2d(x, 32)
+        flat = pooled.view(pooled.size(0), -1)
+        return self.linear(flat)
+class Bottleneck_projected(nn.Module):
+    """Bottleneck residual block with batch normalisation.
+
+    Main path: conv1 (1x1) -> bn1 -> ReLU -> conv2 (3x3, zero padding of 1)
+    -> bn2 -> ReLU -> conv3 (1x1, ``expansion * planes`` outputs) -> bn3.
+    The shortcut output is added to the main path and a final ReLU applied.
+
+    NOTE(review): ``stride`` and ``option`` are accepted but never read in
+    this class, so every convolution keeps nn.Conv2d's default stride of 1.
+    Confirm this is intentional before relying on ``stride`` to downsample.
+    """
+
+    # Output channel count of the block is ``expansion * planes``.
+    expansion = 4
+
+    def __init__(self, in_planes, planes, stride=1, option="A"):
+        super(Bottleneck_projected, self).__init__()
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(
+            planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False
+        )
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(
+            planes, self.expansion * planes, kernel_size=1, bias=False
+        )
+        self.bn3 = nn.BatchNorm2d(self.expansion * planes)
+        self.relu1 = nn.ReLU()
+        self.relu2 = nn.ReLU()
+        self.relu3 = nn.ReLU()
+
+        # An empty Sequential passes its input through unchanged (identity
+        # shortcut). It is replaced by a 1x1 conv + batch norm projection only
+        # when the input channel count differs from the block's output.
+        self.shortcut = nn.Sequential()
+        if in_planes != self.expansion * planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(
+                    in_planes, self.expansion * planes, kernel_size=1, bias=False
+                ),
+                nn.BatchNorm2d(self.expansion * planes),
+            )
+    def forward(self, x):
+        # Main path: two conv/bn/ReLU stages, then conv/bn without activation.
+        out = self.relu1(self.bn1(self.conv1(x)))
+        out = self.relu2(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+        # Residual add (out of place), followed by the final activation.
+        out = out + self.shortcut(x)
+        out = self.relu3(out)
+        return out
+
+class Bottleneck_fused_projected(nn.Module):
+    """Bottleneck residual block built from convolutions and ReLUs only.
+
+    Main path: conv1 (1x1) -> ReLU -> conv2 (3x3, zero padding of 1) -> ReLU
+    -> conv3 (1x1, ``expansion * planes`` outputs). The shortcut output is
+    added to the main path and a final ReLU applied. This class defines no
+    batch-norm layers.
+
+    NOTE(review): presumably "fused" means batch norm has been folded into
+    the convolution weights — confirm against the training/export flow.
+    NOTE(review): ``stride`` and ``option`` are accepted but never read in
+    this class, so every convolution keeps nn.Conv2d's default stride of 1.
+    """
+
+    # Output channel count of the block is ``expansion * planes``.
+    expansion = 4
+
+    def __init__(self, in_planes, planes, stride=1, option="A"):
+        super(Bottleneck_fused_projected, self).__init__()
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
+
+        self.conv2 = nn.Conv2d(
+            planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False
+        )
+
+        self.conv3 = nn.Conv2d(
+            planes, self.expansion * planes, kernel_size=1, bias=False
+        )
+
+        self.relu1 = nn.ReLU()
+        self.relu2 = nn.ReLU()
+        self.relu3 = nn.ReLU()
+
+        # An empty Sequential passes its input through unchanged (identity
+        # shortcut). It is replaced by a bias-free 1x1 conv projection only
+        # when the input channel count differs from the block's output.
+        self.shortcut = nn.Sequential()
+        if in_planes != self.expansion * planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, bias=False)
+            )
+
+    def forward(self, x):
+        # Main path: two conv/ReLU stages, then a conv without activation.
+        out = self.relu1((self.conv1(x)))
+        out = self.relu2((self.conv2(out)))
+        out = self.conv3(out)
+        # In-place residual add: it mutates the conv3 output tensor, not ``x``.
+        out += self.shortcut(x)
+        out = self.relu3(out)
+        return out
+
+def Resnet50_conv2x_offload(num_classes):
+    """Build the three-stage ``CombinedModel`` used by this example.
+
+    The stages are ``PreAIELayers``, ``AIEConv2xOffload`` (built from
+    ``Bottleneck_fused_projected``) and ``PostAIELayers`` (built from
+    ``Bottleneck_projected`` with block counts ``[4, 6, 3]``).
+
+    Args:
+        num_classes: forwarded to ``PostAIELayers``, where it sets the output
+            size of the final linear layer.
+    """
+    # Built in the same order as the stages run, matching the original
+    # left-to-right argument evaluation.
+    stem = PreAIELayers()
+    offloaded = AIEConv2xOffload(Bottleneck_fused_projected, [1])
+    tail = PostAIELayers(Bottleneck_projected, [4, 6, 3], num_classes)
+    return CombinedModel(stem, offloaded, tail)
diff --git a/programming_examples/ml/resnet/ptq_conv2x/requirements.txt b/programming_examples/ml/resnet/ptq_conv2x/requirements.txt
@@ -0,0 +1,4 @@
+brevitas
+torchvision
+tqdm
+opencv-python
diff --git a/programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit b/programming_examples/ml/resnet/ptq_conv2x/run_makefile.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess, torch
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile 
+// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s
+// CHECK: PASS!