Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[CI] enable clang tidy #29

Merged
merged 14 commits into from
Jan 10, 2024
Merged
65 changes: 65 additions & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
Checks: >
-*,
# readability-identifier-naming,
airMeng marked this conversation as resolved.
Show resolved Hide resolved
readability-const-return-type,
readability-redundant-smartptr-get,
readability-misleading-indentation,
readability-redundant-control-flow,
readability-redundant-member-init,
readability-redundant-string-cstr,
readability-redundant-string-init,
readability-simplify-subscript-expr,
readability-static-accessed-through-instance,
readability-static-definition-in-anonymous-namespace,
readability-uniqueptr-delete-release,
readability-container-size-empty,
# readability-delete-null-pointer,  # not applicable for gcc/msvc
readability-make-member-function-const,
readability-redundant-access-specifiers,
performance-for-range-copy,
performance-implicit-conversion-in-loop,
performance-inefficient-algorithm,
performance-inefficient-string-concatenation,
# performance-inefficient-vector-operation,
airMeng marked this conversation as resolved.
Show resolved Hide resolved
performance-move-const-arg,
performance-unnecessary-copy-initialization,
performance-unnecessary-value-param,
performance-no-automatic-move,
performance-trivially-destructible,
modernize-make-shared,
modernize-use-bool-literals,
modernize-use-emplace,
modernize-use-equals-default,
modernize-use-override,
modernize-use-nullptr,
modernize-use-using,
bugprone-assert-side-effect,
bugprone-copy-constructor-init,
bugprone-forward-declaration-namespace,
bugprone-move-forwarding-reference,
bugprone-parent-virtual-call,
bugprone-too-small-loop-variable,
bugprone-undefined-memory-manipulation,
bugprone-unhandled-self-assignment,
bugprone-multiple-statement-macro,
bugprone-macro-parentheses,
# google-default-arguments,
misc-misplaced-const,
misc-definitions-in-headers,
misc-redundant-expression,
misc-uniqueptr-reset-release,
misc-unused-alias-decls,
misc-unused-using-decls,
cppcoreguidelines-prefer-member-initializer,

CheckOptions:
- key: readability-identifier-naming.ClassCase
value: lower_case
- key: readability-identifier-naming.StructCase
value: lower_case
- key: readability-identifier-naming.ClassSuffix
value: _t
- key: readability-identifier-naming.StructSuffix
value: _t
2 changes: 1 addition & 1 deletion .github/workflows/format_scan.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
"bandit",
"clangformat",
"cloc",
"cpplint",
"clangtidy",
# "pydocstyle",
#"pyspelling",
]
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/scripts/formatScan/clangformat.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ log_path=${log_dir}/clangformat.log
cd /neural-speed
git config --global --add safe.directory "*"

cd /neural-speed/neural_speed
python scripts/clang-format.py
cd /neural-speed
python clang-format.py

echo "run git diff"
git diff 2>&1 | tee -a ${log_path}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,19 @@

source /neural-speed/.github/workflows/scripts/change_color.sh

pip install cpplint
pip install cmake ninja clang-tidy==16.0.4
REPO_DIR=/neural-speed
log_dir=/neural-speed/.github/workflows/scripts/formatScan
log_path=${log_dir}/cpplint.log
cpplint --extensions cpp,hpp --filter=-build/include_subdir,-build/header_guard --recursive --quiet --linelength=120 ${REPO_DIR}/neural_speed 2>&1 | tee ${log_path}
if [[ ! -f ${log_path} ]] || [[ $(grep -c "Total errors found:" ${log_path}) != 0 ]]; then
log_path=${log_dir}/clangtidy.log

# compile binary
cd ${REPO_DIR}
mkdir build
cd build
cmake .. -G Ninja -DNS_USE_CLANG_TIDY=CHECK -DBTLA_USE_OPENMP=OFF
ninja 2>&1 | tee ${log_path}

if [[ ! -f ${log_path} ]] || [[ $(grep -c "warning:" ${log_path}) != 0 ]]; then
exit 1
fi
$BOLD_PURPLE && echo "Congratulations, check passed!" && $LIGHT_PURPLE && echo "You can click on the artifact button to see the log details." && $RESET
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/scripts/install_binary.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
source /neural-speed/.github/workflows/scripts/change_color.sh

cd /neural-speed
export CMAKE_ARGS="-DNE_DNNL_CACHE_DIR=/cache"
$BOLD_YELLOW && echo "---------------- git submodule update --init --recursive -------------" && $RESET
git config --global --add safe.directory "*"
git submodule update --init --recursive
Expand Down
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,8 @@
.vscode/*
bestla/build/
bestla/build/*
neural_speed.egg-info/
build/
debug/
.eggs/
dist/
103 changes: 49 additions & 54 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,73 +27,68 @@ endif()
#

# general
option(NE_STATIC "neural_engine: static link libraries" OFF)
option(NE_NATIVE "neural_engine: enable -march=native flag" OFF)
option(NE_LTO "neural_engine: enable link time optimization" OFF)
option(NE_BUILD_APPLICATIONS "neural_engine: build applications" ON)
option(NS_STATIC "neural_speed: static link libraries" OFF)
option(NS_NATIVE "neural_speed: enable -march=native flag" OFF)
option(NS_LTO "neural_speed: enable link time optimization" OFF)
option(NS_BUILD_APPLICATIONS "neural_speed: build applications" ON)

# GPU
option(NE_GPU "neural_engine: enable GPU inference" OFF)
option(NS_GPU "neural_speed: enable GPU inference" OFF)

# debug
option(NE_ALL_WARNINGS "neural_engine: enable all compiler warnings" ON)
option(NE_ALL_WARNINGS_3RD_PARTY "neural_engine: enable all compiler warnings in 3rd party libs" OFF)
option(NE_GPROF "neural_engine: enable gprof" OFF)
option(NS_ALL_WARNINGS "neural_speed: enable all compiler warnings" ON)
option(NS_ALL_WARNINGS_3RD_PARTY "neural_speed: enable all compiler warnings in 3rd party libs" OFF)
option(NS_GPROF "neural_speed: enable gprof" OFF)

# tensor parallelism
option(NE_TP "neural_engine: enable tensor parallism" OFF)
if (NE_TP)
add_compile_definitions(NE_TP_MODEL)
option(NS_TP "neural_speed: enable tensor parallelism" OFF)
if (NS_TP)
add_compile_definitions(NS_TP_MODEL)
endif()

# sanitizers
option(NE_SANITIZE_THREAD "neural_engine: enable thread sanitizer" OFF)
option(NE_SANITIZE_ADDRESS "neural_engine: enable address sanitizer" OFF)
option(NE_SANITIZE_UNDEFINED "neural_engine: enable undefined sanitizer" OFF)
option(NS_SANITIZE_THREAD "neural_speed: enable thread sanitizer" OFF)
option(NS_SANITIZE_ADDRESS "neural_speed: enable address sanitizer" OFF)
option(NS_SANITIZE_UNDEFINED "neural_speed: enable undefined sanitizer" OFF)

# instruction set specific
option(NE_AVX "neural_engine: enable AVX" ON)
option(NE_AVX2 "neural_engine: enable AVX2" ON)
option(NE_AVX512 "neural_engine: enable AVX512" OFF)
option(NE_AVX512_VBMI "neural_engine: enable AVX512-VBMI" OFF)
option(NE_AVX512_VNNI "neural_engine: enable AVX512-VNNI" OFF)
option(NE_FMA "neural_engine: enable FMA" ON)
option(NE_AMX "neural_engine: enable AMX" OFF)
option(NE_F16C "neural_engine: enable F16C" ON)

# 3rd party libs
option(NE_ONEDNN "neural_engine: use oneDNN" ON)
option(NE_LIBXSMM "neural_engine: use libxsmm" OFF)
option(NE_XETLA "neural_engine: use XeTLA" OFF)
if (NE_GPU)
set(NE_XETLA ON)
option(NS_AVX "neural_speed: enable AVX" ON)
option(NS_AVX2 "neural_speed: enable AVX2" ON)
option(NS_AVX512 "neural_speed: enable AVX512" OFF)
option(NS_AVX512_VBMI "neural_speed: enable AVX512-VBMI" OFF)
option(NS_AVX512_VNNI "neural_speed: enable AVX512-VNNI" OFF)
option(NS_FMA "neural_speed: enable FMA" ON)
option(NS_AMX "neural_speed: enable AMX" OFF)
option(NS_F16C "neural_speed: enable F16C" ON)

option(NS_BUILD_TESTS "neural_speed: build tests" ${NS_STANDALONE})
option(NS_BTLA_UT "enable BesTLA's unit tests" OFF)
option(NS_BUILD_EXAMPLES "neural_speed: build examples" ${NS_STANDALONE})
option(NS_USE_CLANG_TIDY "neural_speed: clang-tidy check" OFF)


if(NS_BUILD_TESTS)
add_compile_definitions(NS_BUILD_TESTS)
endif()

option(NE_BUILD_TESTS "neural_engine: build tests" ${NE_STANDALONE})
option(NE_BTLA_UT "enable BesTLA's unit tests" OFF)
option(NE_BUILD_EXAMPLES "neural_engine: build examples" ${NE_STANDALONE})
if(NE_BUILD_TESTS)
add_compile_definitions(NE_BUILD_TESTS)
add_compile_definitions(NS_PERF)
option(NS_BEAM_SEARCH_VERBOSE "neural_speed: print beam search processing log" OFF)
if (NS_BEAM_SEARCH_VERBOSE)
add_compile_definitions(NS_BEAM_SEARCH_VERBOSE_ON)
endif()

add_compile_definitions(NE_PERF)
option(NE_BEAM_SEARCH_VERBOSE "neural_engine: print beam search processing log" OFF)
if (NE_BEAM_SEARCH_VERBOSE)
add_compile_definitions(NE_BEAM_SEARCH_VERBOSE_ON)
endif()
option(NE_GELU_VEC "neural_engine: enable vec in gelu" ON)
if (NE_GELU_VEC)
add_compile_definitions(NE_GELU_USE_VEC)
option(NS_GELU_VEC "neural_speed: enable vec in gelu" ON)
if (NS_GELU_VEC)
add_compile_definitions(NS_GELU_USE_VEC)
endif()
option(NE_PYTHON_API "neural_engine: use python api" OFF)
option(NE_SIMD_VEC_DOT_F16 "neural_engine: enable vec_dot_fp16 SIMD optimization" ON)
option(NS_PYTHON_API "neural_speed: use python api" OFF)
option(NS_SIMD_VEC_DOT_F16 "neural_speed: enable vec_dot_fp16 SIMD optimization" ON)
option(BUILD_SHARED_LIBS "If build as shared libs" ON)

if (NE_SIMD_VEC_DOT_F16)
add_compile_definitions(NE_SIMD_VEC_DOT_F16)
if (NS_SIMD_VEC_DOT_F16)
add_compile_definitions(NS_SIMD_VEC_DOT_F16)
endif()

if(NE_BUILD_TESTS)
if(NS_BUILD_TESTS)
enable_testing()
endif()

Expand All @@ -104,7 +99,7 @@ if (MSVC)
endif()
endif()

if (NE_LTO)
if (NS_LTO)
include(CheckIPOSupported)
check_ipo_supported(RESULT result OUTPUT output)
if (result)
Expand All @@ -115,16 +110,16 @@ if (NE_LTO)
endif()

if (NOT MSVC)
if (NE_STATIC)
if (NS_STATIC)
add_link_options(-static)
if (MINGW)
add_link_options(-static-libgcc -static-libstdc++)
endif()
endif()
if (NE_GPROF)
if (NS_GPROF)
add_compile_options(-pg)
endif()
if (NE_NATIVE)
if (NS_NATIVE)
add_compile_options(-march=native)
endif()
endif()
Expand All @@ -133,11 +128,11 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)

if (NE_PYTHON_API)
if (NS_PYTHON_API)
add_subdirectory(third_party/pybind11)
endif()

if (NE_BTLA_UT)
if (NS_BTLA_UT)
set(BTLA_UT_ALL ON)
endif()
include(FindOpenMP)
Expand Down
2 changes: 1 addition & 1 deletion CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"inherits": "x64-debug",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"NE_BTLA_UT": "ON"
"NS_BTLA_UT": "ON"
}
}
]
Expand Down
9 changes: 8 additions & 1 deletion bestla/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ project(bestla LANGUAGES CXX VERSION 0.1.0)
file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)

option(BTLA_USE_OPENMP "Enable OpenMP thread pool" ON)

option(BTLA_UT_ALL "Enable all unit tests" OFF)
option(BTLA_UT_DEBUG "Enable debug unit tests" OFF)
option(BTLA_UT_EPILOGUE "Enable unit test for epilogue" OFF)
Expand Down Expand Up @@ -41,7 +43,12 @@ endif()
include(GNUInstallDirs)
add_library(${PROJECT_NAME} INTERFACE)
add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME})


if(BTLA_USE_OPENMP)
message(STATUS "BesTLA using OpenMP")
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
airMeng marked this conversation as resolved.
Show resolved Hide resolved
endif(BTLA_USE_OPENMP)

target_include_directories(
${PROJECT_NAME} INTERFACE
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
Expand Down
6 changes: 3 additions & 3 deletions bestla/bestla/bestla_parallel.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include <functional>
#include <thread>
#include <vector>
#ifdef _OPENMP
#if BTLA_OPENMP
#include <omp.h>
#endif
#include "bestla_utils.h"
Expand Down Expand Up @@ -588,14 +588,14 @@ class IThreading {
public:
explicit IThreading(int nthreads) : mThreadNum(nthreads) {}
virtual void parallel_for(const thread_func& func) const = 0;
virtual inline void sync() const = 0;
virtual inline void sync() const { assert(0); };
virtual int num_threads() const { return mThreadNum; };
virtual void set_threads(int nthreads) = 0;

protected:
int mThreadNum;
};
#ifdef _OPENMP
#if BTLA_OPENMP
class OMPThreading : public IThreading {
public:
explicit OMPThreading(int nthreads) : IThreading(nthreads) { omp_set_num_threads(nthreads); }
Expand Down
23 changes: 11 additions & 12 deletions bestla/bestla/bestla_prologue_b.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,8 +382,7 @@ class WeightKBlockNInteger {
void packQWeight(const int N, const int K, const int8_t* B, const int ldb, const float* scales,
const int8_t* zero_points, StorageWeight* stor, parallel::IThreading* threading) {
setQuantCorrection(N, K, zero_points, scales, stor, threading);
if (stor->mDType == BTLA_DTYPE::S8 || stor->mDType == BTLA_DTYPE::F8_E4M3 ||
stor->mDType == BTLA_DTYPE::F8_E5M2) {
if (stor->mDType == BTLA_DTYPE::S8 || stor->mDType == BTLA_DTYPE::F8_E4M3 || stor->mDType == BTLA_DTYPE::F8_E5M2) {
reorderWeight(N, K, B, ldb, stor->WPtr<int8_t>(), threading);
} else {
auto reorded = utils::amalloc<int8_t>((size_t)stor->mKPad * stor->mNPad);
Expand Down Expand Up @@ -774,8 +773,8 @@ class WeightKBlockNInteger {
auto ptr = reinterpret_cast<StorageWeight*>(stor);
auto quant_dtype = ptr->mDType;
if (quant_dtype == BTLA_DTYPE::S8) {
kernel::wrapper::QuantizeSignIntRowBlock::forward<ISA_T, BTLA_DTYPE::S8>(
srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, ptr->mBlockSize);
kernel::wrapper::QuantizeSignIntRowBlock::forward<ISA_T, BTLA_DTYPE::S8>(srcptr, dstptr, row, col, ld_src, ld_dst,
scales, zero_points, ptr->mBlockSize);
} else if (quant_dtype == BTLA_DTYPE::S4_FULLRANGE) {
kernel::wrapper::QuantizeSignIntRowBlock::forward<ISA_T, BTLA_DTYPE::S4_FULLRANGE>(
srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, ptr->mBlockSize);
Expand Down Expand Up @@ -953,20 +952,20 @@ class WeightKBlockNFloat : public WeightKBlockNInteger<_GemmCore_T, ISA_T> {
auto ptr = reinterpret_cast<StorageWeight*>(stor);
auto quant_dtype = ptr->mDType;
if (quant_dtype == BTLA_DTYPE::F8_E4M3) {
kernel::wrapper::QuantizeF8RowBlock::forward<ISA_T, BTLA_DTYPE::F8_E4M3>(
srcptr, dstptr, row, col, ld_src, ld_dst, scales, ptr->mBlockSize, ptr->SDtype());
kernel::wrapper::QuantizeF8RowBlock::forward<ISA_T, BTLA_DTYPE::F8_E4M3>(srcptr, dstptr, row, col, ld_src, ld_dst,
scales, ptr->mBlockSize, ptr->SDtype());
} else if (quant_dtype == BTLA_DTYPE::F8_E5M2) {
kernel::wrapper::QuantizeF8RowBlock::forward<ISA_T, BTLA_DTYPE::F8_E5M2>(
srcptr, dstptr, row, col, ld_src, ld_dst, scales, ptr->mBlockSize, ptr->SDtype());
kernel::wrapper::QuantizeF8RowBlock::forward<ISA_T, BTLA_DTYPE::F8_E5M2>(srcptr, dstptr, row, col, ld_src, ld_dst,
scales, ptr->mBlockSize, ptr->SDtype());
} else if (quant_dtype == BTLA_DTYPE::F4_BNB) {
kernel::wrapper::QuantizeF4RowBlock::forward<ISA_T, BTLA_DTYPE::F4_BNB>(srcptr, dstptr, row, col, ld_src, ld_dst,
scales, zero_points, ptr->mBlockSize);
scales, zero_points, ptr->mBlockSize);
} else if (quant_dtype == BTLA_DTYPE::F4_E2M1) {
kernel::wrapper::QuantizeF4RowBlock::forward<ISA_T, BTLA_DTYPE::F4_E2M1>(
srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, ptr->mBlockSize);
kernel::wrapper::QuantizeF4RowBlock::forward<ISA_T, BTLA_DTYPE::F4_E2M1>(srcptr, dstptr, row, col, ld_src, ld_dst,
scales, zero_points, ptr->mBlockSize);
} else if (quant_dtype == BTLA_DTYPE::F4_NF4) {
kernel::wrapper::QuantizeF4RowBlock::forward<ISA_T, BTLA_DTYPE::F4_NF4>(srcptr, dstptr, row, col, ld_src, ld_dst,
scales, zero_points, ptr->mBlockSize);
scales, zero_points, ptr->mBlockSize);
} else {
assert(0);
}
Expand Down
Loading