Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[VerisiliconTimVX] Support Allwinner V853 and OpenWrt, Don't merge!!! #10484

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ lite_option(LITE_WITH_NNADAPTER "Enable NNAdapter in lite mode"
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(XPU_WITH_XFT "Enable XPU-XFT in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" OFF)
lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
lite_option(LITE_WITH_METAL "Enable Metal support in lite" OFF)
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
Expand Down
4 changes: 2 additions & 2 deletions cmake/os/armlinux.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ if(ARMLINUX_ARCH_ABI STREQUAL "armv7")
endif()
if(ARMLINUX_ARCH_ABI STREQUAL "armv7hf")
set(CMAKE_SYSTEM_PROCESSOR arm)
set(CMAKE_C_COMPILER "arm-linux-gnueabihf-gcc")
set(CMAKE_CXX_COMPILER "arm-linux-gnueabihf-g++")
set(CMAKE_C_COMPILER "/opt/toolchain-sunxi-musl/bin/arm-openwrt-linux-muslgnueabi-gcc")
set(CMAKE_CXX_COMPILER "/opt/toolchain-sunxi-musl/bin/arm-openwrt-linux-muslgnueabi-g++")
endif()
set(HOST_C_COMPILER $ENV{CC})
set(HOST_CXX_COMPILER $ENV{CXX})
Expand Down
8 changes: 5 additions & 3 deletions lite/api/tools/benchmark/profile/memory_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "memory_info.h"
#include "memory_info.h" // NOLINT

#ifdef __linux__
#include <malloc.h>
Expand Down Expand Up @@ -40,13 +40,15 @@ MemoryUsage GetMemoryUsage() {
if (getrusage(RUSAGE_SELF, &res) == 0) {
result.max_rss_kb = res.ru_maxrss;
}
/*
#if defined(__GLIBC__) && __GLIBC_MINOR__ >= 33
const auto mem = mallinfo2();
#else
const auto mem = mallinfo();
#endif
result.total_allocated_bytes = mem.arena;
result.in_use_allocated_bytes = mem.uordblks;
*/
#endif
return result;
}
Expand All @@ -59,6 +61,6 @@ void MemoryUsage::AllStatsToStream(std::ostream* stream) const {
<< in_use_allocated_bytes / 1024.0 / 1024.0 << " MB";
}

} // namespace paddle
} // namespace lite_api
} // namespace profile
} // namespace lite_api
} // namespace paddle
1 change: 1 addition & 0 deletions lite/backends/host/math/topk.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#pragma once
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,16 @@ namespace verisilicon_timvx {
#undef REGISTER_CONVERTER

int Converter::Apply(core::Model* model) {
// Create the input and output tensors in advance so that the input and output
// remain in their original order when saving the NBG model on some machines.
auto input_count = model->input_operands.size();
for (size_t i = 0; i < input_count; i++) {
ConvertOperand(model->input_operands[i]);
}
auto output_count = model->output_operands.size();
for (size_t i = 0; i < output_count; i++) {
ConvertOperand(model->output_operands[i]);
}
// Convert the NNAdapter operations to the tim-vx operations
std::vector<core::Operation*> operations =
SortOperationsInTopologicalOrder(model);
Expand Down Expand Up @@ -89,8 +99,11 @@ std::shared_ptr<tim::vx::Tensor> Converter::AddTensor(

// Converts an NNAdapter operand to a tim-vx tensor and records the mapping.
// A new tensor is created unless the operand is a model output that already
// has a mapped tensor, in which case the existing tensor is reused (so model
// outputs created up-front keep their original order in the saved NBG model).
// NOTE: the scraped diff had left both the old and the new body in place,
// declaring `tensor` twice; only the new behavior is kept here.
std::shared_ptr<tim::vx::Tensor> Converter::ConvertOperand(
    core::Operand* operand, std::vector<int32_t> dimensions) {
  std::shared_ptr<tim::vx::Tensor> tensor = GetMappedTensor(operand);
  if (!tensor || !IsModelOutputOperand(operand)) {
    tensor = AddTensor(&operand->type, operand->buffer, dimensions);
    UpdateTensorMap(operand, tensor);
  }
  return tensor;
}
} // namespace verisilicon_timvx
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ int ConvertExpand(Converter* converter, core::Operation* operation) {
auto shape_count = output_operand->type.dimensions.count;
auto shape_data = output_operand->type.dimensions.data;

std::vector<int32_t> expand_shape;
std::vector<uint32_t> expand_shape;
for (int i = shape_count - 1; i >= 0; i--) {
expand_shape.push_back(shape_data[i]);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ int ConvertFill(Converter* converter, core::Operation* operation) {

auto shape_count = output_operand->type.dimensions.count;
auto shape_data = output_operand->type.dimensions.data;
std::vector<int32_t> shape;
std::vector<uint32_t> shape;
for (int i = shape_count - 1; i >= 0; i--) {
shape.push_back(shape_data[i]);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ endif()
message(STATUS "NNADAPTER_VERISILICON_TIMVX_VIV_SDK_ROOT=${NNADAPTER_VERISILICON_TIMVX_VIV_SDK_ROOT}")

# Patch TIM-VX: remove the -Werror flags to avoid compilation errors, lower the minimum CMake version to 3.10, and replace the non-standard `uint` with `uint32_t` in custom_base.h
set(VERISILICON_TIMVX_PATCH_COMMAND sed -e "s/-Werror//g" -i CMakeLists.txt && sed -e "s/3.14/3.10/g" -i CMakeLists.txt)
set(VERISILICON_TIMVX_PATCH_COMMAND sed -e "s/-Werror//g" -i CMakeLists.txt && sed -e "s/3.14/3.10/g" -i CMakeLists.txt && sed -e "s/uint /uint32_t /g" -i include/tim/vx/ops/custom_base.h)
if(CMAKE_SYSTEM_NAME MATCHES "Android")
# Hack the TIM-VX and change the name of lib 'libArchModelSw.so' to 'libarchmodelSw.so' for Android
set(VERISILICON_TIMVX_PATCH_COMMAND ${VERISILICON_TIMVX_PATCH_COMMAND} && sed -e "s/libArchModelSw/libarchmodelSw/g" -i cmake/local_sdk.cmake)
Expand All @@ -78,7 +78,7 @@ ExternalProject_Add(
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DEXTERNAL_VIV_SDK=${NNADAPTER_VERISILICON_TIMVX_VIV_SDK_ROOT}
-DCMAKE_INSTALL_PREFIX=${VERISILICON_TIMVX_INSTALL_DIR}
-DTIM_VX_ENABLE_TENSOR_CACHE=OFF
-DTIM_VX_ENABLE_TENSOR_CACHE=OFF
${CROSS_COMPILE_CMAKE_ARGS}
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include "driver/verisilicon_timvx/converter/converter.h"
#include "driver/verisilicon_timvx/optimizer/convert_fill_like_into_mul_add.h"
#include "driver/verisilicon_timvx/optimizer/convert_meshgrid_into_reshape_expand.h"
#include "driver/verisilicon_timvx/optimizer/fix_ops.h"
#include "driver/verisilicon_timvx/optimizer/remove_relu.h"
#include "driver/verisilicon_timvx/optimizer/unpack_op_fusion.h"
#include "optimizer/constant_fold_operations.h"
#include "optimizer/convert_adaptive_pool2d_into_pool2d.h"
Expand Down Expand Up @@ -95,6 +97,10 @@ int Program::Build(core::Model* model, core::Cache* cache) {
for (size_t i = 0; i < input_count; i++) {
const auto& type = cache->input_types[i];
input_tensors_[i] = CreateTimVXTensor(graph_.get(), &type);
NNADAPTER_VLOG(3) << "Model input[" << i
<< "] id=" << input_tensors_[i]->GetId()
<< nnadapter::OperandTypeToString(
&cache->input_types[i]);
NNADAPTER_CHECK(input_tensors_[i]);
}
}
Expand All @@ -106,6 +112,10 @@ int Program::Build(core::Model* model, core::Cache* cache) {
for (size_t i = 0; i < output_count; i++) {
const auto& type = cache->output_types[i];
output_tensors_[i] = CreateTimVXTensor(graph_.get(), &type);
NNADAPTER_VLOG(3) << "Model output[" << i
<< "] id=" << output_tensors_[i]->GetId()
<< nnadapter::OperandTypeToString(
&cache->output_types[i]);
NNADAPTER_CHECK(output_tensors_[i]);
}
auto nbg_op = graph_->CreateOperation<tim::vx::ops::NBG>(
Expand Down Expand Up @@ -133,7 +143,9 @@ int Program::Build(core::Model* model, core::Cache* cache) {
FuseSigmoidMulIntoSwish(model);
ConvertAdaptivePool2dIntoPool2d(model);
UnpackOpFusion(model);
// FixOps(model);
ConvertQuantizationSymmToAsymm(model);
// RemoveRelu(model);
NNADAPTER_VLOG(5) << "Optimized model:" << std::endl << Visualize(model);
// Convert a NNAdapter model to a tim-vx graph
Converter converter(graph_.get(), &tensors_);
Expand All @@ -150,6 +162,9 @@ int Program::Build(core::Model* model, core::Cache* cache) {
NNADAPTER_CHECK(tensors_.find(operand) != tensors_.end());
input_tensors_[i] = tensors_[operand].front();
NNADAPTER_CHECK(input_tensors_[i]);
NNADAPTER_VLOG(3) << "Model input[" << i
<< "] id=" << input_tensors_[i]->GetId()
<< nnadapter::OperandTypeToString(&operand->type);
input_types_[i] = type;
}
}
Expand All @@ -164,6 +179,9 @@ int Program::Build(core::Model* model, core::Cache* cache) {
NNADAPTER_CHECK(tensors_.find(operand) != tensors_.end());
output_tensors_[i] = tensors_[operand].back();
NNADAPTER_CHECK(output_tensors_[i]);
NNADAPTER_VLOG(3) << "Model output[" << i
<< "] id=" << output_tensors_[i]->GetId()
<< nnadapter::OperandTypeToString(&operand->type);
output_types_[i] = type;
}
// Compile tim-vx graph and serialize to NBG(Network Binary Graph)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "driver/verisilicon_timvx/optimizer/fix_ops.h"
#include <cmath>
#include <vector>
#include "utility/debug.h"
#include "utility/logging.h"
#include "utility/modeling.h"
#include "utility/utility.h"

namespace nnadapter {
namespace verisilicon_timvx {

// Works around wrong results from RESIZE_LINEAR/RESIZE_NEAREST by appending a
// dummy RELU after the resize output, so the resize result is routed through
// an extra operation before reaching its consumers / the model outputs.
// NOTE(review): this assumes the resize output is always non-negative (e.g.
// uint8 asymm quantized data), making the extra RELU a no-op -- confirm
// before enabling the pass (its call site is currently commented out).
// Fixed: removed the unused local `input_operands`.
static void FixResizeLinearNearest(core::Model* model,
                                   core::Operation* operation) {
  auto& output_operands = operation->output_operands;
  auto output_operand = output_operands[0];
  // Capture the consumers before appending, then redirect them (and the
  // model outputs, if the resize output is a model output) to the new
  // dummy operand.
  auto output_operations = GetOperandConsumers(model, output_operand);
  auto dummy_operand =
      AppendUnaryOperation(model, output_operand, NNADAPTER_RELU);
  UpdateOperationInputOperands(
      output_operations, output_operand, dummy_operand);
  UpdateModelOutputOperands(model, output_operand, dummy_operand);
}

// Applies operation-specific workarounds to the NNAdapter model before it is
// converted to a tim-vx graph. Currently only RESIZE_LINEAR/RESIZE_NEAREST
// are patched (see FixResizeLinearNearest above).
void FixOps(core::Model* model) {
  std::vector<core::Operation*> operations =
      SortOperationsInTopologicalOrder(model);
  for (auto operation : operations) {
    // Fixed: the log previously said "Converting", but this pass only fixes
    // operations -- the actual conversion happens later in Converter::Apply.
    NNADAPTER_VLOG(5) << "Fixing " << OperationTypeToString(operation->type)
                      << " ...";
    switch (operation->type) {
      case NNADAPTER_RESIZE_LINEAR:
      case NNADAPTER_RESIZE_NEAREST:
        FixResizeLinearNearest(model, operation);
        break;
      default:
        break;
    }
  }
}

} // namespace verisilicon_timvx
} // namespace nnadapter
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "core/types.h"

namespace nnadapter {
namespace verisilicon_timvx {

// Applies driver-specific workarounds to the NNAdapter model (currently only
// RESIZE_LINEAR/RESIZE_NEAREST are patched) before conversion to tim-vx.
void FixOps(core::Model* model);

}  // namespace verisilicon_timvx
}  // namespace nnadapter
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "driver/verisilicon_timvx/optimizer/remove_relu.h"
#include <algorithm>
#include <map>
#include <vector>
#include "utility/debug.h"
#include "utility/logging.h"
#include "utility/micros.h"
#include "utility/modeling.h"
#include "utility/utility.h"

namespace nnadapter {
namespace verisilicon_timvx {

// Convert input(scale,zero_point=128)->relu->output to
// input(scale,zero_point=0)
// Removes a quantized RELU by folding its effect into the quantization
// params of its input operand: the producer then writes directly in the
// RELU output's range (scale copied from the RELU output, zero_point forced
// to 0 so negative values saturate away at quantization time).
// NOTE(review): the RELU output's own zero_point is assumed to be 0 --
// confirm before enabling this pass (its call site is commented out).
NNADAPTER_EXPORT void RemoveRelu(core::Model* model) {
  std::vector<core::Operation*> operations =
      SortOperationsInTopologicalOrder(model);
  for (auto operation : operations) {
    // Only handle RELU ops whose input is an internal operand consumed
    // solely by this RELU, with uint8 asymm per-layer quantization on both
    // the input and the output.
    if (operation->type != NNADAPTER_RELU) continue;
    auto relu_input_operand = operation->input_operands[0];
    auto relu_output_operand = operation->output_operands[0];
    if (IsModelInputOperand(relu_input_operand)) continue;
    auto relu_input_consumers = GetOperandConsumers(model, relu_input_operand);
    if (relu_input_consumers.size() != 1) continue;
    if (!IsUInt8AsymmPerLayerQuantType(relu_input_operand->type.precision) ||
        !IsUInt8AsymmPerLayerQuantType(relu_output_operand->type.precision))
      continue;
    // Rewrite the input operand's quantization so it already represents the
    // post-RELU value range.
    relu_input_operand->type.asymm_per_layer_params.scale =
        relu_output_operand->type.asymm_per_layer_params.scale;
    relu_input_operand->type.asymm_per_layer_params.zero_point = 0;
    // Bypass the RELU: its consumers (and the model outputs, if the RELU
    // output was one) now read the input operand directly, then the dead
    // operand and operation are dropped from the model.
    auto relu_output_consumers =
        GetOperandConsumers(model, relu_output_operand);
    UpdateOperationInputOperands(
        relu_output_consumers, relu_output_operand, relu_input_operand);
    if (IsModelOutputOperand(relu_output_operand)) {
      UpdateModelOutputOperands(model, relu_output_operand, relu_input_operand);
    }
    RemoveOperand(model, relu_output_operand);
    RemoveOperation(model, operation);
  }
}

} // namespace verisilicon_timvx
} // namespace nnadapter
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "core/types.h"

namespace nnadapter {
namespace verisilicon_timvx {

// Removes quantized input->RELU->output patterns by folding the RELU into the
// quantization params of its input operand (see remove_relu.cc for details).
// Fixed: pointer declarator placement ("Model *model" -> "Model* model") for
// consistency with the sibling headers in this driver (e.g. fix_ops.h).
void RemoveRelu(core::Model* model);

}  // namespace verisilicon_timvx
}  // namespace nnadapter
6 changes: 1 addition & 5 deletions lite/core/device_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -562,12 +562,8 @@ int set_sched_affinity(const std::vector<int>& cpu_ids) {

#define PD_CPU_ZERO(cpusetp) memset((cpusetp), 0, sizeof(cpu_set_t))

// set affinity for thread
#ifdef __GLIBC__
// set affinity for thread
pid_t pid = syscall(SYS_gettid);
#else
pid_t pid = gettid();
#endif
cpu_set_t mask;
PD_CPU_ZERO(&mask);
unsigned int Runmask = 0;
Expand Down
Loading