diff --git a/CMakeLists.txt b/CMakeLists.txt index 02023951b1a..3596810c595 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,7 +63,7 @@ lite_option(LITE_WITH_NNADAPTER "Enable NNAdapter in lite mode" lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(XPU_WITH_XFT "Enable XPU-XFT in lite mode" OFF) lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF) -lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) +lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" OFF) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_METAL "Enable Metal support in lite" OFF) lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF) diff --git a/cmake/os/armlinux.cmake b/cmake/os/armlinux.cmake index ab79359eeb1..f6426fc13aa 100644 --- a/cmake/os/armlinux.cmake +++ b/cmake/os/armlinux.cmake @@ -38,8 +38,8 @@ if(ARMLINUX_ARCH_ABI STREQUAL "armv7") endif() if(ARMLINUX_ARCH_ABI STREQUAL "armv7hf") set(CMAKE_SYSTEM_PROCESSOR arm) - set(CMAKE_C_COMPILER "arm-linux-gnueabihf-gcc") - set(CMAKE_CXX_COMPILER "arm-linux-gnueabihf-g++") + set(CMAKE_C_COMPILER "/opt/toolchain-sunxi-musl/bin/arm-openwrt-linux-muslgnueabi-gcc") + set(CMAKE_CXX_COMPILER "/opt/toolchain-sunxi-musl/bin/arm-openwrt-linux-muslgnueabi-g++") endif() set(HOST_C_COMPILER $ENV{CC}) set(HOST_CXX_COMPILER $ENV{CXX}) diff --git a/lite/api/tools/benchmark/profile/memory_info.cc b/lite/api/tools/benchmark/profile/memory_info.cc index c4227b07246..30061c208fb 100644 --- a/lite/api/tools/benchmark/profile/memory_info.cc +++ b/lite/api/tools/benchmark/profile/memory_info.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "memory_info.h" +#include "memory_info.h" // NOLINT #ifdef __linux__ #include @@ -40,6 +40,7 @@ MemoryUsage GetMemoryUsage() { if (getrusage(RUSAGE_SELF, &res) == 0) { result.max_rss_kb = res.ru_maxrss; } +/* #if defined(__GLIBC__) && __GLIBC_MINOR__ >= 33 const auto mem = mallinfo2(); #else @@ -47,6 +48,7 @@ MemoryUsage GetMemoryUsage() { #endif result.total_allocated_bytes = mem.arena; result.in_use_allocated_bytes = mem.uordblks; +*/ #endif return result; } @@ -59,6 +61,6 @@ void MemoryUsage::AllStatsToStream(std::ostream* stream) const { << in_use_allocated_bytes / 1024.0 / 1024.0 << " MB"; } -} // namespace paddle -} // namespace lite_api } // namespace profile +} // namespace lite_api +} // namespace paddle diff --git a/lite/backends/host/math/topk.h b/lite/backends/host/math/topk.h index f488ad8dfff..02c15617288 100644 --- a/lite/backends/host/math/topk.h +++ b/lite/backends/host/math/topk.h @@ -14,6 +14,7 @@ #pragma once #include +#include #include #include diff --git a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/converter/converter.cc b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/converter/converter.cc index a7c78a7e247..47eed3693d5 100644 --- a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/converter/converter.cc +++ b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/converter/converter.cc @@ -32,6 +32,16 @@ namespace verisilicon_timvx { #undef REGISTER_CONVERTER int Converter::Apply(core::Model* model) { + // Create the input and output tensors in advance so that the input and output + // remain in their original order when saving the NBG model on some machines. 
+ auto input_count = model->input_operands.size(); + for (size_t i = 0; i < input_count; i++) { + ConvertOperand(model->input_operands[i]); + } + auto output_count = model->output_operands.size(); + for (size_t i = 0; i < output_count; i++) { + ConvertOperand(model->output_operands[i]); + } // Convert the NNAdapter operations to the tim-vx operations std::vector operations = SortOperationsInTopologicalOrder(model); @@ -89,8 +99,11 @@ std::shared_ptr Converter::AddTensor( std::shared_ptr Converter::ConvertOperand( core::Operand* operand, std::vector dimensions) { - auto tensor = AddTensor(&operand->type, operand->buffer, dimensions); - UpdateTensorMap(operand, tensor); + std::shared_ptr tensor = GetMappedTensor(operand); + if (!tensor || !IsModelOutputOperand(operand)) { + tensor = AddTensor(&operand->type, operand->buffer, dimensions); + UpdateTensorMap(operand, tensor); + } return tensor; } } // namespace verisilicon_timvx diff --git a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/converter/expand.cc b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/converter/expand.cc index 110328caba0..435f70050cc 100644 --- a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/converter/expand.cc +++ b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/converter/expand.cc @@ -32,7 +32,7 @@ int ConvertExpand(Converter* converter, core::Operation* operation) { auto shape_count = output_operand->type.dimensions.count; auto shape_data = output_operand->type.dimensions.data; - std::vector expand_shape; + std::vector expand_shape; for (int i = shape_count - 1; i >= 0; i--) { expand_shape.push_back(shape_data[i]); } diff --git a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/converter/fill.cc b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/converter/fill.cc index 7aff186761f..1b4a06e544f 100644 --- a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/converter/fill.cc +++ 
b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/converter/fill.cc @@ -31,7 +31,7 @@ int ConvertFill(Converter* converter, core::Operation* operation) { auto shape_count = output_operand->type.dimensions.count; auto shape_data = output_operand->type.dimensions.data; - std::vector shape; + std::vector shape; for (int i = shape_count - 1; i >= 0; i--) { shape.push_back(shape_data[i]); } diff --git a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/dependencies.cmake b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/dependencies.cmake index 092445d576b..a2380e8cd87 100644 --- a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/dependencies.cmake +++ b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/dependencies.cmake @@ -54,7 +54,7 @@ endif() message(STATUS "NNADAPTER_VERISILICON_TIMVX_VIV_SDK_ROOT=${NNADAPTER_VERISILICON_TIMVX_VIV_SDK_ROOT}") # Remove the -Werror flags to avoid compilation errors -set(VERISILICON_TIMVX_PATCH_COMMAND sed -e "s/-Werror//g" -i CMakeLists.txt && sed -e "s/3.14/3.10/g" -i CMakeLists.txt) +set(VERISILICON_TIMVX_PATCH_COMMAND sed -e "s/-Werror//g" -i CMakeLists.txt && sed -e "s/3.14/3.10/g" -i CMakeLists.txt && sed -e "s/uint /uint32_t /g" -i include/tim/vx/ops/custom_base.h) if(CMAKE_SYSTEM_NAME MATCHES "Android") # Hack the TIM-VX and change the name of lib 'libArchModelSw.so' to 'libarchmodelSw.so' for Android set(VERISILICON_TIMVX_PATCH_COMMAND ${VERISILICON_TIMVX_PATCH_COMMAND} && sed -e "s/libArchModelSw/libarchmodelSw/g" -i cmake/local_sdk.cmake) @@ -78,7 +78,7 @@ ExternalProject_Add( -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DEXTERNAL_VIV_SDK=${NNADAPTER_VERISILICON_TIMVX_VIV_SDK_ROOT} -DCMAKE_INSTALL_PREFIX=${VERISILICON_TIMVX_INSTALL_DIR} - -DTIM_VX_ENABLE_TENSOR_CACHE=OFF + -DTIM_VX_ENABLE_TENSOR_CACHE=OFF ${CROSS_COMPILE_CMAKE_ARGS} ) diff --git a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/engine.cc 
b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/engine.cc index d199b3a8d9d..09866e8ca1c 100644 --- a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/engine.cc +++ b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/engine.cc @@ -19,6 +19,8 @@ #include "driver/verisilicon_timvx/converter/converter.h" #include "driver/verisilicon_timvx/optimizer/convert_fill_like_into_mul_add.h" #include "driver/verisilicon_timvx/optimizer/convert_meshgrid_into_reshape_expand.h" +#include "driver/verisilicon_timvx/optimizer/fix_ops.h" +#include "driver/verisilicon_timvx/optimizer/remove_relu.h" #include "driver/verisilicon_timvx/optimizer/unpack_op_fusion.h" #include "optimizer/constant_fold_operations.h" #include "optimizer/convert_adaptive_pool2d_into_pool2d.h" @@ -95,6 +97,10 @@ int Program::Build(core::Model* model, core::Cache* cache) { for (size_t i = 0; i < input_count; i++) { const auto& type = cache->input_types[i]; input_tensors_[i] = CreateTimVXTensor(graph_.get(), &type); + NNADAPTER_VLOG(3) << "Model input[" << i + << "] id=" << input_tensors_[i]->GetId() + << nnadapter::OperandTypeToString( + &cache->input_types[i]); NNADAPTER_CHECK(input_tensors_[i]); } } @@ -106,6 +112,10 @@ int Program::Build(core::Model* model, core::Cache* cache) { for (size_t i = 0; i < output_count; i++) { const auto& type = cache->output_types[i]; output_tensors_[i] = CreateTimVXTensor(graph_.get(), &type); + NNADAPTER_VLOG(3) << "Model output[" << i + << "] id=" << output_tensors_[i]->GetId() + << nnadapter::OperandTypeToString( + &cache->output_types[i]); NNADAPTER_CHECK(output_tensors_[i]); } auto nbg_op = graph_->CreateOperation( @@ -133,7 +143,9 @@ int Program::Build(core::Model* model, core::Cache* cache) { FuseSigmoidMulIntoSwish(model); ConvertAdaptivePool2dIntoPool2d(model); UnpackOpFusion(model); + // FixOps(model); ConvertQuantizationSymmToAsymm(model); + // RemoveRelu(model); NNADAPTER_VLOG(5) << "Optimized model:" << std::endl << 
Visualize(model); // Convert a NNAdapter model to a tim-vx graph Converter converter(graph_.get(), &tensors_); @@ -150,6 +162,9 @@ int Program::Build(core::Model* model, core::Cache* cache) { NNADAPTER_CHECK(tensors_.find(operand) != tensors_.end()); input_tensors_[i] = tensors_[operand].front(); NNADAPTER_CHECK(input_tensors_[i]); + NNADAPTER_VLOG(3) << "Model input[" << i + << "] id=" << input_tensors_[i]->GetId() + << nnadapter::OperandTypeToString(&operand->type); input_types_[i] = type; } } @@ -164,6 +179,9 @@ int Program::Build(core::Model* model, core::Cache* cache) { NNADAPTER_CHECK(tensors_.find(operand) != tensors_.end()); output_tensors_[i] = tensors_[operand].back(); NNADAPTER_CHECK(output_tensors_[i]); + NNADAPTER_VLOG(3) << "Model output[" << i + << "] id=" << output_tensors_[i]->GetId() + << nnadapter::OperandTypeToString(&operand->type); output_types_[i] = type; } // Compile tim-vx graph and serialize to NBG(Network Binary Graph) diff --git a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/optimizer/fix_ops.cc b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/optimizer/fix_ops.cc new file mode 100644 index 00000000000..3b90b5038cc --- /dev/null +++ b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/optimizer/fix_ops.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "driver/verisilicon_timvx/optimizer/fix_ops.h" +#include +#include +#include "utility/debug.h" +#include "utility/logging.h" +#include "utility/modeling.h" +#include "utility/utility.h" + +namespace nnadapter { +namespace verisilicon_timvx { + +static void FixResizeLinearNearest(core::Model* model, + core::Operation* operation) { + auto& input_operands = operation->input_operands; + auto& output_operands = operation->output_operands; + auto output_operand = output_operands[0]; + auto output_operations = GetOperandConsumers(model, output_operand); + auto dummy_operand = + AppendUnaryOperation(model, output_operand, NNADAPTER_RELU); + UpdateOperationInputOperands( + output_operations, output_operand, dummy_operand); + UpdateModelOutputOperands(model, output_operand, dummy_operand); +} + +void FixOps(core::Model* model) { + std::vector operations = + SortOperationsInTopologicalOrder(model); + for (auto operation : operations) { + NNADAPTER_VLOG(5) << "Converting " << OperationTypeToString(operation->type) + << " ..."; + switch (operation->type) { + case NNADAPTER_RESIZE_LINEAR: + case NNADAPTER_RESIZE_NEAREST: + FixResizeLinearNearest(model, operation); + break; + default: + break; + } + } +} + +} // namespace verisilicon_timvx +} // namespace nnadapter diff --git a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/optimizer/fix_ops.h b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/optimizer/fix_ops.h new file mode 100644 index 00000000000..801a8a12aac --- /dev/null +++ b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/optimizer/fix_ops.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "core/types.h" + +namespace nnadapter { +namespace verisilicon_timvx { + +void FixOps(core::Model* model); + +} // namespace verisilicon_timvx +} // namespace nnadapter diff --git a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/optimizer/remove_relu.cc b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/optimizer/remove_relu.cc new file mode 100644 index 00000000000..147aa11b896 --- /dev/null +++ b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/optimizer/remove_relu.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "driver/verisilicon_timvx/optimizer/remove_relu.h" +#include +#include +#include +#include "utility/debug.h" +#include "utility/logging.h" +#include "utility/micros.h" +#include "utility/modeling.h" +#include "utility/utility.h" + +namespace nnadapter { +namespace verisilicon_timvx { + +// Convert input(scale,zero_point=128)->relu->output to +// input(scale,zero_point=0) +NNADAPTER_EXPORT void RemoveRelu(core::Model* model) { + std::vector operations = + SortOperationsInTopologicalOrder(model); + for (auto operation : operations) { + if (operation->type != NNADAPTER_RELU) continue; + auto relu_input_operand = operation->input_operands[0]; + auto relu_output_operand = operation->output_operands[0]; + if (IsModelInputOperand(relu_input_operand)) continue; + auto relu_input_consumers = GetOperandConsumers(model, relu_input_operand); + if (relu_input_consumers.size() != 1) continue; + if (!IsUInt8AsymmPerLayerQuantType(relu_input_operand->type.precision) || + !IsUInt8AsymmPerLayerQuantType(relu_output_operand->type.precision)) + continue; + relu_input_operand->type.asymm_per_layer_params.scale = + relu_output_operand->type.asymm_per_layer_params.scale; + relu_input_operand->type.asymm_per_layer_params.zero_point = 0; + auto relu_output_consumers = + GetOperandConsumers(model, relu_output_operand); + UpdateOperationInputOperands( + relu_output_consumers, relu_output_operand, relu_input_operand); + if (IsModelOutputOperand(relu_output_operand)) { + UpdateModelOutputOperands(model, relu_output_operand, relu_input_operand); + } + RemoveOperand(model, relu_output_operand); + RemoveOperation(model, operation); + } +} + +} // namespace verisilicon_timvx +} // namespace nnadapter diff --git a/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/optimizer/remove_relu.h b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/optimizer/remove_relu.h new file mode 100644 index 00000000000..a34064cded5 --- /dev/null +++ 
b/lite/backends/nnadapter/nnadapter/src/driver/verisilicon_timvx/optimizer/remove_relu.h @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "core/types.h" + +namespace nnadapter { +namespace verisilicon_timvx { + +void RemoveRelu(core::Model *model); + +} // namespace verisilicon_timvx +} // namespace nnadapter diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 4a70faad063..154d4097f5b 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -562,12 +562,8 @@ int set_sched_affinity(const std::vector& cpu_ids) { #define PD_CPU_ZERO(cpusetp) memset((cpusetp), 0, sizeof(cpu_set_t)) -// set affinity for thread -#ifdef __GLIBC__ + // set affinity for thread pid_t pid = syscall(SYS_gettid); -#else - pid_t pid = gettid(); -#endif cpu_set_t mask; PD_CPU_ZERO(&mask); unsigned int Runmask = 0; diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc index d2a57f25935..79bacc2a1a2 100644 --- a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc @@ -13,15 +13,15 @@ // limitations under the License. 
#include "lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT #include "lite/core/optimizer/mir/graph_visualize_pass.h" #include "lite/core/optimizer/mir/pass_registry.h" @@ -29,6 +29,8 @@ namespace paddle { namespace lite { namespace mir { +inline float abs(float f) { return f >= 0 ? f : -f; } + bool XPUKernelScoreCmp(const std::pair>& a, const std::pair>& b) { return a.first > b.first; } diff --git a/lite/kernels/arm/scale_compute.cc b/lite/kernels/arm/scale_compute.cc index 7f9949676e3..d1b71823e43 100644 --- a/lite/kernels/arm/scale_compute.cc +++ b/lite/kernels/arm/scale_compute.cc @@ -20,6 +20,8 @@ namespace lite { namespace kernels { namespace arm { +inline float abs(float f) { return f >= 0 ? f : -f; } + template void ScaleCompute::Run() { auto& param = this->template Param();