diff --git a/.bazeliskrc b/.bazeliskrc
new file mode 100644
index 0000000..f4c1884
--- /dev/null
+++ b/.bazeliskrc
@@ -0,0 +1 @@
+USE_BAZEL_VERSION=7.4.1
\ No newline at end of file
diff --git a/.bazelrc b/.bazelrc
index a73f8c7..ec2e009 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -15,6 +15,10 @@
common --experimental_repo_remote_exec
common --experimental_cc_shared_library
+common --registry=https://raw.githubusercontent.com/secretflow/bazel-registry/main
+common --registry=https://bcr.bazel.build
+common --registry=https://baidu.github.io/babylon/registry
+
build --incompatible_new_actions_api=false
build --copt=-fdiagnostics-color=always
build --enable_platform_specific_config
diff --git a/.bazelversion b/.bazelversion
deleted file mode 100644
index f22d756..0000000
--- a/.bazelversion
+++ /dev/null
@@ -1 +0,0 @@
-6.5.0
diff --git a/.clang-tidy b/.clang-tidy
index a82670a..e422034 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -76,3 +76,4 @@ CheckOptions:
- key: performance-unnecessary-value-param.AllowedTypes
value: PtBufferView
+
diff --git a/.gitignore b/.gitignore
index 0f9ce30..248050f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ Pipfile
# bazel
bazel-*
+MODULE.bazel.lock
# cmake related
abseil-cpp
@@ -44,4 +45,4 @@ rpc_data
coverity*/
idir/
-ossutil_output/
\ No newline at end of file
+ossutil_output/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..02d6def
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,116 @@
+# Contribution guidelines
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project.
+
+## Style
+
+### C++ coding style
+
+In general, please use clang-format to format code, and follow clang-tidy tips.
+
+Most of the code style is derived from the
+[Google C++ style guidelines](https://google.github.io/styleguide/cppguide.html), except:
+
+- Exceptions are allowed and encouraged where appropriate.
+- Header guards should use `#pragma once`.
+- Adopt [camelBack](https://llvm.org/docs/Proposals/VariableNames.html#variable-names-coding-standard-options)
+ for function names.
+- Use [fixed width integer types](https://en.cppreference.com/w/cpp/types/integer) whenever possible.
+- Avoid using size_t on interface APIs.
+
+The compiler portion of the project follows [MLIR style](https://mlir.llvm.org/getting_started/DeveloperGuide/#style-guide).
+
+### Other tips
+
+- Git commit messages should be meaningful; we suggest imperative [keywords](https://github.com/joelparkerhenderson/git_commit_message#summary-keywords).
+- Developers must write unit tests (line coverage must be greater than 80%); tests should be deterministic.
+- Read awesome [Abseil Tips](https://abseil.io/tips/)
+
+## Build
+
+### Prerequisite
+
+
+#### Docker
+
+```sh
+## start container
+docker run -d -it --name psi-dev-$(whoami) \
+ --mount type=bind,source="$(pwd)",target=/home/admin/dev/ \
+ -w /home/admin/dev \
+ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+ --cap-add=NET_ADMIN \
+ --privileged=true \
+ secretflow/ubuntu-base-ci:latest \
+ bash
+
+# attach to build container
+docker exec -it psi-dev-$(whoami) bash
+```
+
+#### Linux
+
+```sh
+Install gcc>=11.2, cmake>=3.26, ninja, nasm>=2.15, python>=3.10, bazelisk, xxd, lld
+```
+
+#### macOS
+
+```sh
+# macOS >= 13.0, Xcode >= 15.0
+
+# Install Xcode
+https://apps.apple.com/us/app/xcode/id497799835?mt=12
+
+# Select Xcode toolchain version
+sudo xcode-select -s /Applications/Xcode.app/Contents/Developer
+
+# Install homebrew
+https://brew.sh/
+
+# Install dependencies
+# Be aware: brew may install a newer version of bazel; when that happens, bazel will give an error message during build.
+# Please follow instructions in the error message to install the required version
+brew install bazelisk cmake ninja libomp wget
+
+# For Intel mac only
+brew install nasm
+```
+
+### Build & UnitTest
+
+
+
+
+``` sh
+# build as debug
+bazel build //... -c dbg
+
+# build as release
+bazel build //... -c opt
+
+# test
+bazel test //...
+
+# [optional] build & test with ASAN or UBSAN, for macOS users please use configs with macOS prefix
+bazel test //... --features=asan
+bazel test //... --features=ubsan
+```
+
+### Bazel build options
+
+- `--define gperf=on` enable gperf
+
+### Build docs
+
+```sh
+# prerequisite
+pip install -U -r docs/requirements.txt
+
+cd docs && make html # html docs will be in docs/_build/html
+```
diff --git a/MODULE.bazel b/MODULE.bazel
new file mode 100644
index 0000000..0cea1dd
--- /dev/null
+++ b/MODULE.bazel
@@ -0,0 +1,83 @@
+###############################################################################
+# Bazel now uses Bzlmod by default to manage external dependencies.
+# Please consider migrating your external dependencies from WORKSPACE to MODULE.bazel.
+#
+# For more details, please check https://github.com/bazelbuild/bazel/issues/18958
+###############################################################################
+
+module(
+ name = "psi",
+ version = "0.6.0.dev241212",
+ compatibility_level = 1,
+)
+
+bazel_dep(name = "yacl", version = "20241212.0-871832a")
+
+single_version_override(
+ module_name = "grpc",
+ patch_strip = 1,
+ patches = [
+ "//bazel/patches:grpc-1.66.patch",
+ "//bazel/patches:grpc-module-file.patch",
+ ],
+ version = "1.66.0.bcr.3",
+)
+
+bazel_dep(name = "platforms", version = "0.0.8")
+bazel_dep(name = "apple_support", version = "1.17.1")
+bazel_dep(name = "rules_cc", version = "0.0.13")
+bazel_dep(name = "rules_proto", version = "6.0.0.bcr.1")
+bazel_dep(name = "rules_foreign_cc", version = "0.12.0")
+bazel_dep(name = "protobuf", version = "27.3")
+bazel_dep(name = "spdlog", version = "1.14.1")
+bazel_dep(name = "fmt", version = "11.0.2")
+bazel_dep(name = "abseil-cpp", version = "20240722.0")
+bazel_dep(name = "gflags", version = "2.2.2")
+bazel_dep(name = "rapidjson", version = "1.1.0.bcr.20241007")
+bazel_dep(name = "boost.math", version = "1.83.0")
+bazel_dep(name = "boost.uuid", version = "1.83.0")
+bazel_dep(name = "boost.algorithm", version = "1.83.0.bcr.1")
+bazel_dep(name = "boost.multiprecision", version = "1.83.0")
+bazel_dep(name = "zlib", version = "1.3.1.bcr.3")
+
+# --registry=https://baidu.github.io/babylon/registry
+bazel_dep(name = "openssl", version = "3.3.2")
+
+# self-host registry
+bazel_dep(name = "org_interconnection", version = "0.0.1")
+bazel_dep(name = "fourqlib", version = "0.0.0-20220901-1031567")
+bazel_dep(name = "arrow", version = "10.0.0", repo_name = "org_apache_arrow")
+bazel_dep(name = "ippcp", version = "2021.8")
+bazel_dep(name = "libdivide", version = "5.0")
+bazel_dep(name = "emp-tool", version = "0.2.5")
+bazel_dep(name = "sparsehash", version = "2.0.4")
+bazel_dep(name = "sse2neon", version = "1.7.0-20240330-8df2f48")
+
+# non module dependencies
+non_module_dependencies = use_extension("//bazel:defs.bzl", "non_module_dependencies")
+use_repo(
+ non_module_dependencies,
+ "apsi",
+ "curve25519-donna",
+ "kuku",
+ "perfetto",
+ "seal",
+ "zstd",
+)
+
+new_local_repository = use_repo_rule("@bazel_tools//tools/build_defs/repo:local.bzl", "new_local_repository")
+new_local_repository(
+ name = "macos_omp_x64",
+ build_file = "@yacl//bazel:local_openmp_macos.BUILD",
+ path = "/usr/local/opt/libomp",
+)
+
+new_local_repository(
+ name = "macos_omp_arm64",
+ build_file = "@yacl//bazel:local_openmp_macos.BUILD",
+ path = "/opt/homebrew/opt/libomp/",
+)
+
+# test
+bazel_dep(name = "googletest", version = "1.15.2", dev_dependency = True, repo_name = "com_google_googletest")
+bazel_dep(name = "google_benchmark", version = "1.8.5", dev_dependency = True, repo_name = "com_github_google_benchmark")
diff --git a/README.md b/README.md
index b2cc375..d09ca45 100644
--- a/README.md
+++ b/README.md
@@ -122,7 +122,7 @@ In the first terminal, run the following command
docker run -it --rm --network host --mount type=bind,source=/tmp/receiver,target=/root/receiver --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=NET_ADMIN --privileged=true secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/psi-anolis8:latest --config receiver/receiver.config
```
-In the other terminal, run the following command simultaneously.
+In the other terminal, run the following command simultaneously.
```bash
docker run -it --rm --network host --mount type=bind,source=/tmp/sender,target=/root/sender --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=NET_ADMIN --privileged=true secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/psi-anolis8:latest --config sender/sender.config
@@ -166,7 +166,7 @@ Install gcc>=11.2, cmake>=3.26, ninja, nasm>=2.15, python>=3.8, bazel, golang, x
```
> **Note**
-Please install bazel with version in .bazelversion or use bazelisk.
+Please install bazel with version in .bazeliskrc or use bazelisk.
### Build & UnitTest
@@ -213,3 +213,4 @@ Please refer to [PSI V2 Benchmark](docs/user_guide/psi_v2_benchmark.md)
## APSI Benchmark
Please refer to [APSI Benchmark](docs/user_guide/apsi_benchmark.md)
+
diff --git a/RELEASE.md b/RELEASE.md
index d20f8ba..c8327d1 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -40,7 +40,7 @@
## v0.3.0beta
- [Improvement] add uuid in system temp folder.
- [Improvement] use arrow csv reader in pir.
-- [Bugfix] fix typo in psi config check.
+- [Bugfix] fix typo in psi config check.
## v0.3.0.dev240304
- [API] expose ic_mode in RunLegacyPsi api
diff --git a/WORKSPACE b/WORKSPACE
deleted file mode 100644
index 3771892..0000000
--- a/WORKSPACE
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2021 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-workspace(name = "psi")
-
-load("//bazel:repositories.bzl", "psi_deps")
-
-psi_deps()
-
-#
-# yacl
-# Warning: psi relies on yacl to bring in common 3p libraries.
-# Please make sure yacl_deps are called right after psi_deps.
-#
-load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
-load("@yacl//bazel:repositories.bzl", "yacl_deps")
-
-yacl_deps()
-
-load("@rules_python//python:repositories.bzl", "py_repositories")
-
-py_repositories()
-
-load(
- "@rules_foreign_cc//foreign_cc:repositories.bzl",
- "rules_foreign_cc_dependencies",
-)
-
-rules_foreign_cc_dependencies(
- register_built_tools = False,
- register_default_tools = False,
- register_preinstalled_tools = True,
-)
-
-load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
-
-grpc_deps()
-
-# Not mentioned in official docs... mentioned here https://github.com/grpc/grpc/issues/20511
-load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
-
-grpc_extra_deps()
-
-#
-# boost
-#
-load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps")
-
-boost_deps()
diff --git a/bazel/arrow.BUILD b/bazel/arrow.BUILD
deleted file mode 100644
index c8fd52c..0000000
--- a/bazel/arrow.BUILD
+++ /dev/null
@@ -1,169 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# copied from https://github.com/tensorflow/io/blob/master/third_party/arrow.BUILD and made some changes
-# Description:
-# Apache Arrow library
-
-load("@com_github_grpc_grpc//bazel:cc_grpc_library.bzl", "cc_grpc_library")
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"]) # Apache 2.0
-
-exports_files(["LICENSE.txt"])
-
-genrule(
- name = "arrow_util_config",
- srcs = ["cpp/src/arrow/util/config.h.cmake"],
- outs = ["cpp/src/arrow/util/config.h"],
- cmd = ("sed " +
- "-e 's/@ARROW_VERSION_MAJOR@/9/g' " +
- "-e 's/@ARROW_VERSION_MINOR@/0/g' " +
- "-e 's/@ARROW_VERSION_PATCH@/0/g' " +
- "-e 's/cmakedefine ARROW_USE_NATIVE_INT128/undef ARROW_USE_NATIVE_INT128/g' " +
- "-e 's/cmakedefine ARROW_WITH_OPENTELEMETRY/undef ARROW_WITH_OPENTELEMETRY/g' " +
- "-e 's/cmakedefine ARROW_GCS/undef ARROW_GCS/g' " +
- "-e 's/cmakedefine ARROW_S3/undef ARROW_S3/g' " +
- "-e 's/cmakedefine ARROW_JEMALLOC/undef ARROW_JEMALLOC/g' " +
- "-e 's/cmakedefine ARROW_JEMALLOC_VENDORED/undef ARROW_JEMALLOC_VENDORED/g' " +
- "-e 's/cmakedefine/define/g' " +
- "$< >$@"),
-)
-
-genrule(
- name = "parquet_version_h",
- srcs = ["cpp/src/parquet/parquet_version.h.in"],
- outs = ["cpp/src/parquet/parquet_version.h"],
- cmd = ("sed " +
- "-e 's/@PARQUET_VERSION_MAJOR@/1/g' " +
- "-e 's/@PARQUET_VERSION_MINOR@/5/g' " +
- "-e 's/@PARQUET_VERSION_PATCH@/1/g' " +
- "$< >$@"),
-)
-
-cc_library(
- name = "arrow_vendored",
- srcs = glob([
- "cpp/src/arrow/vendored/datetime/*.h",
- "cpp/src/arrow/vendored/datetime/*.cpp",
- "cpp/src/arrow/vendored/pcg/pcg_uint128.hpp",
- "cpp/src/arrow/vendored/pcg/pcg_random.hpp",
- "cpp/src/arrow/vendored/pcg/pcg_extras.hpp",
- "cpp/src/arrow/vendored/uriparser/*.h",
- "cpp/src/arrow/vendored/uriparser/*.c",
- ]),
- includes = [
- "cpp/src",
- ],
- visibility = ["//visibility:private"],
-)
-
-cc_library(
- name = "arrow",
- srcs = glob(
- [
- "cpp/src/arrow/*.cc",
- "cpp/src/arrow/c/*.cc",
- "cpp/src/arrow/array/*.cc",
- "cpp/src/arrow/csv/*.cc",
- "cpp/src/arrow/extension/**/*.cc",
- "cpp/src/arrow/extension/**/*.h",
- "cpp/src/arrow/io/*.cc",
- "cpp/src/arrow/ipc/*.cc",
- "cpp/src/arrow/json/*.cc",
- "cpp/src/arrow/tensor/*.cc",
- "cpp/src/arrow/compute/**/*.cc",
- "cpp/src/arrow/util/*.cc",
- "cpp/src/arrow/vendored/optional.hpp",
- "cpp/src/arrow/vendored/string_view.hpp",
- "cpp/src/arrow/vendored/variant.hpp",
- "cpp/src/arrow/vendored/base64.cpp",
- "cpp/src/arrow/**/*.h",
- "cpp/src/parquet/**/*.h",
- "cpp/src/parquet/**/*.cc",
- "cpp/src/generated/*.h",
- "cpp/src/generated/*.cpp",
- "cpp/thirdparty/flatbuffers/include/flatbuffers/*.h",
- ],
- exclude = [
- "cpp/src/**/*_benchmark.cc",
- "cpp/src/**/*_main.cc",
- "cpp/src/**/*_nossl.cc",
- "cpp/src/**/*_test.cc",
- "cpp/src/**/test_*.h",
- "cpp/src/**/test_*.cc",
- "cpp/src/**/benchmark_util.h",
- "cpp/src/**/benchmark_util.cc",
- "cpp/src/**/*hdfs*.cc",
- "cpp/src/**/*fuzz*.cc",
- "cpp/src/arrow/memory_pool_jemalloc.cc",
- "cpp/src/**/file_to_stream.cc",
- "cpp/src/**/stream_to_file.cc",
- "cpp/src/arrow/dataset/file_orc*",
- "cpp/src/arrow/filesystem/gcsfs*.cc",
- "cpp/src/arrow/filesystem/s3*.cc",
- "cpp/src/arrow/filesystem/*_test_util.cc",
- "cpp/src/arrow/util/bpacking_avx2.cc",
- "cpp/src/arrow/util/bpacking_avx512.cc",
- "cpp/src/arrow/util/bpacking_neon.cc",
- "cpp/src/arrow/util/tracing_internal.cc",
- "cpp/src/arrow/compute/**/*_avx2.cc",
- ],
- ),
- hdrs = [
- # declare header from above genrule
- "cpp/src/arrow/util/config.h",
- "cpp/src/parquet/parquet_version.h",
- ],
- copts = [],
- defines = [
- "ARROW_WITH_BROTLI",
- "ARROW_WITH_SNAPPY",
- "ARROW_WITH_LZ4",
- "ARROW_WITH_ZLIB",
- "ARROW_WITH_ZSTD",
- "ARROW_WITH_BZ2",
- "ARROW_STATIC",
- "ARROW_EXPORT=",
- "PARQUET_STATIC",
- "PARQUET_EXPORT=",
- ],
- includes = [
- "cpp/src",
- "cpp/src/arrow/vendored/xxhash",
- "cpp/thirdparty/flatbuffers/include",
- ],
- linkopts = ["-lpthread"],
- textual_hdrs = [
- "cpp/src/arrow/vendored/xxhash/xxhash.c",
- ],
- deps = [
- ":arrow_vendored",
- "@boost//:multiprecision",
- "@brotli",
- "@bzip2",
- "@com_github_facebook_zstd//:zstd",
- "@com_github_gflags_gflags//:gflags",
- "@com_github_google_snappy//:snappy",
- "@com_github_grpc_grpc//:grpc++",
- "@com_github_grpc_grpc//:grpc++_reflection",
- "@com_github_lz4_lz4//:lz4",
- "@com_github_tencent_rapidjson//:rapidjson",
- "@com_github_xtensor_xsimd//:xsimd",
- "@com_google_double_conversion//:double-conversion",
- "@org_apache_thrift//:thrift",
- "@zlib",
- ],
-)
diff --git a/bazel/brotli.BUILD b/bazel/brotli.BUILD
deleted file mode 100644
index c586412..0000000
--- a/bazel/brotli.BUILD
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# copied from https://github.com/tensorflow/io/blob/v0.25.0/third_party/brotli.BUILD
-# Description:
-# Brotli library
-
-licenses(["notice"]) # MIT license
-
-exports_files(["LICENSE"])
-
-cc_library(
- name = "brotli",
- srcs = glob([
- "c/common/*.c",
- "c/common/*.h",
- "c/dec/*.c",
- "c/dec/*.h",
- "c/enc/*.c",
- "c/enc/*.h",
- "c/include/brotli/*.h",
- ]),
- hdrs = [],
- defines = [],
- includes = [
- "c/dec",
- "c/include",
- ],
- linkopts = [],
- visibility = ["//visibility:public"],
-)
diff --git a/bazel/bzip2.BUILD b/bazel/bzip2.BUILD
deleted file mode 100644
index fc618d3..0000000
--- a/bazel/bzip2.BUILD
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# copied from https://github.com/tensorflow/io/blob/v0.25.0/third_party/bzip2.BUILD
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"]) # BSD-like license
-
-cc_library(
- name = "bzip2",
- srcs = [
- "blocksort.c",
- "bzlib.c",
- "bzlib_private.h",
- "compress.c",
- "crctable.c",
- "decompress.c",
- "huffman.c",
- "randtable.c",
- ],
- hdrs = [
- "bzlib.h",
- ],
- copts = [
- ],
- includes = ["."],
-)
diff --git a/bazel/rapidjson.BUILD b/bazel/defs.bzl
similarity index 59%
rename from bazel/rapidjson.BUILD
rename to bazel/defs.bzl
index 86748d0..a758117 100644
--- a/bazel/rapidjson.BUILD
+++ b/bazel/defs.bzl
@@ -1,4 +1,4 @@
-# Copyright 2023 Ant Group Co., Ltd.
+# Copyright 2024 Ant Group Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,20 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+load("//bazel:repositories.bzl", "psi_deps")
-# copied from https://github.com/tensorflow/io/blob/v0.25.0/third_party/rapidjson.BUILD
+def _non_module_deps_impl(_module_ctx):
+ psi_deps()
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"]) # MIT/JSON license
-
-cc_library(
- name = "rapidjson",
- srcs = glob([
- "include/**/*.h",
- ]),
- copts = [],
- includes = [
- "include",
- ],
+non_module_dependencies = module_extension(
+ implementation = _non_module_deps_impl,
)
diff --git a/bazel/emp-tool.BUILD b/bazel/emp-tool.BUILD
deleted file mode 100644
index 57a2c77..0000000
--- a/bazel/emp-tool.BUILD
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@yacl//bazel:yacl.bzl", "yacl_cmake_external")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
- name = "all_srcs",
- srcs = glob(["**"]),
-)
-
-yacl_cmake_external(
- name = "emp-tool",
- cache_entries = {
- "OPENSSL_ROOT_DIR": "$EXT_BUILD_DEPS/openssl",
- "BUILD_TESTING": "OFF",
- },
- lib_source = ":all_srcs",
- out_data_dirs = ["cmake"],
- out_static_libs = [
- "libemp-tool.a",
- ],
- deps = [
- "@com_github_openssl_openssl//:openssl",
- ],
-)
diff --git a/bazel/gperftools.BUILD b/bazel/gperftools.BUILD
index b4314dd..4ace425 100644
--- a/bazel/gperftools.BUILD
+++ b/bazel/gperftools.BUILD
@@ -1,5 +1,4 @@
load("@rules_foreign_cc//foreign_cc:defs.bzl", "configure_make")
-load("@rules_cc//cc:defs.bzl", "cc_library")
package(default_visibility = ["//visibility:public"])
diff --git a/bazel/hexl.BUILD b/bazel/hexl.BUILD
deleted file mode 100644
index 5fbe1e1..0000000
--- a/bazel/hexl.BUILD
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
- name = "all_srcs",
- srcs = glob(["**"]),
-)
-
-cmake(
- name = "hexl",
- cache_entries = {
- "CMAKE_BUILD_TYPE": "Release",
- "CpuFeatures_DIR": "$EXT_BUILD_DEPS/cpu_features/lib/cmake/CpuFeatures/",
- "HEXL_BENCHMARK": "OFF",
- "HEXL_TESTING": "OFF",
- "CMAKE_INSTALL_LIBDIR": "lib",
- },
- generate_args = ["-GNinja"],
- lib_source = ":all_srcs",
- out_data_dirs = ["lib/cmake"],
- out_static_libs = ["libhexl.a"],
- deps = [
- "@com_github_google_cpu_features//:cpu_features",
- ],
-)
diff --git a/bazel/ipp.BUILD b/bazel/ipp.BUILD
deleted file mode 100644
index 56e308f..0000000
--- a/bazel/ipp.BUILD
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@yacl//bazel:yacl.bzl", "yacl_cmake_external")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
- name = "all_srcs",
- srcs = glob(["**"]),
-)
-
-yacl_cmake_external(
- name = "ipp",
- cache_entries = {
- "ARCH": "intel64",
- "OPENSSL_INCLUDE_DIR": "$EXT_BUILD_DEPS/openssl/include",
- "OPENSSL_LIBRARIES": "$EXT_BUILD_DEPS/openssl/lib",
- "OPENSSL_ROOT_DIR": "$EXT_BUILD_DEPS/openssl",
- "CMAKE_BUILD_TYPE": "Release",
- },
- lib_source = ":all_srcs",
- out_static_libs = [
- "intel64/libippcp.a",
- "intel64/libcrypto_mb.a",
- ],
- deps = [
- "@com_github_openssl_openssl//:openssl",
- ],
-)
diff --git a/bazel/jsoncpp.BUILD b/bazel/jsoncpp.BUILD
index 53134c0..0c096e8 100644
--- a/bazel/jsoncpp.BUILD
+++ b/bazel/jsoncpp.BUILD
@@ -30,10 +30,10 @@ cmake(
"BUILD_OBJECT_LIBS": "OFF",
"CMAKE_INSTALL_LIBDIR": "lib",
},
- generate_args = ["-GNinja"],
env = {
"CCACHE_DISABLE": "1",
},
+ generate_args = ["-GNinja"],
lib_source = "@com_github_open_source_parsers_jsoncpp//:all",
out_static_libs = ["libjsoncpp.a"],
)
diff --git a/bazel/libdivide.BUILD b/bazel/libdivide.BUILD
deleted file mode 100644
index c044063..0000000
--- a/bazel/libdivide.BUILD
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
- name = "all_srcs",
- srcs = glob(["**"]),
-)
-
-cmake(
- name = "libdivide",
- cache_entries = {
- "BUILD_TESTS": "OFF",
- },
- generate_args = ["-GNinja"],
- lib_source = ":all_srcs",
- out_headers_only = True,
- out_include_dir = "include",
-)
diff --git a/bazel/lz4.BUILD b/bazel/lz4.BUILD
deleted file mode 100644
index 80f3e37..0000000
--- a/bazel/lz4.BUILD
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
- name = "all_srcs",
- srcs = glob(["**"]),
-)
-
-cmake(
- name = "lz4",
- cache_entries = {
- "LZ4_BUILD_CLI": "OFF",
- "BUILD_SHARED_LIBS": "OFF",
- "BUILD_STATIC_LIBS": "ON",
- "CMAKE_INSTALL_LIBDIR": "lib",
- },
- generate_args = ["-GNinja"],
- lib_source = ":all_srcs",
- out_static_libs = [
- "liblz4.a",
- ],
- working_directory = "build/cmake",
-)
diff --git a/bazel/microsoft_apsi.BUILD b/bazel/microsoft_apsi.BUILD
index 4dcebc9..02fd14f 100644
--- a/bazel/microsoft_apsi.BUILD
+++ b/bazel/microsoft_apsi.BUILD
@@ -30,19 +30,19 @@ cmake(
"EXT_BUILD_DEPS": "$EXT_BUILD_DEPS",
},
generate_args = ["-GNinja"],
- lib_source = "@com_github_microsoft_apsi//:all",
+ lib_source = "@apsi//:all",
out_include_dir = "include/APSI-0.11",
out_static_libs = ["libapsi-0.11.a"],
deps = [
- "@com_github_facebook_zstd//:zstd",
"@com_github_log4cplus_log4cplus//:log4cplus",
- "@com_github_microsoft_FourQlib//:FourQlib",
"@com_github_microsoft_gsl//:Microsoft.GSL",
- "@com_github_microsoft_kuku//:kuku",
- "@com_github_microsoft_seal//:seal",
"@com_github_open_source_parsers_jsoncpp//:jsoncpp",
"@com_github_zeromq_cppzmq//:cppzmq",
"@com_google_flatbuffers//:FlatBuffers",
+ "@fourqlib//:FourQlib",
+ "@kuku",
+ "@seal",
"@zlib",
+ "@zstd",
],
)
diff --git a/bazel/microsoft_kuku.BUILD b/bazel/microsoft_kuku.BUILD
index 3c4c5a1..1a8f930 100644
--- a/bazel/microsoft_kuku.BUILD
+++ b/bazel/microsoft_kuku.BUILD
@@ -30,7 +30,7 @@ cmake(
"CMAKE_INSTALL_LIBDIR": "lib",
},
generate_args = ["-GNinja"],
- lib_source = "@com_github_microsoft_kuku//:all",
+ lib_source = "@kuku//:all",
out_include_dir = "include/Kuku-2.1",
out_static_libs = ["libkuku-2.1.a"],
deps = ["@com_github_microsoft_gsl//:Microsoft.GSL"],
diff --git a/bazel/patches/apsi.patch b/bazel/patches/apsi.patch
index 20cb990..176c6c6 100644
--- a/bazel/patches/apsi.patch
+++ b/bazel/patches/apsi.patch
@@ -22,11 +22,11 @@ index e683045..067d244 100644
@@ -30,7 +30,7 @@ namespace apsi {
return item_count_;
}
-
+
- private:
+ // private:
IndexTranslationTable() = default;
-
+
std::unordered_map table_idx_to_item_idx_;
diff --git a/receiver/apsi/CMakeLists.txt b/receiver/apsi/CMakeLists.txt
index afce298..1790b30 100644
@@ -45,12 +45,12 @@ index 850ac47..aef52a4 100644
--- a/common/apsi/network/sender_operation.cpp
+++ b/common/apsi/network/sender_operation.cpp
@@ -135,7 +135,7 @@ namespace apsi {
-
+
auto oprf_data = fbs_builder.CreateVector(
reinterpret_cast(data.data()), data.size());
- auto req = fbs::CreateOPRFRequest(fbs_builder, oprf_data);
+ auto req = fbs::CreateOPRFRequest(fbs_builder, oprf_data, bucket_idx);
-
+
fbs::SenderOperationBuilder sop_builder(fbs_builder);
sop_builder.add_request_type(fbs::Request_OPRFRequest);
@@ -180,6 +180,7 @@ namespace apsi {
@@ -58,7 +58,7 @@ index 850ac47..aef52a4 100644
data.resize(oprf_data.size());
copy_bytes(oprf_data.data(), oprf_data.size(), data.data());
+ bucket_idx = sop->request_as_OPRFRequest()->bucket_idx();
-
+
return in_data.size();
}
@@ -231,7 +232,8 @@ namespace apsi {
@@ -68,13 +68,13 @@ index 850ac47..aef52a4 100644
- query_request_parts);
+ query_request_parts,
+ bucket_idx);
-
+
fbs::SenderOperationBuilder sop_builder(fbs_builder);
sop_builder.add_request_type(fbs::Request_QueryRequest);
@@ -346,6 +348,8 @@ namespace apsi {
data.emplace(exponent, move(cts_vec));
}
-
+
+ bucket_idx = req.bucket_idx();
+
return in_data.size();
@@ -91,7 +91,7 @@ index a9cc4df..ce3769c 100644
+
+ std::uint32_t bucket_idx = 0;
}; // class SenderOperationOPRF
-
+
/**
@@ -140,6 +142,8 @@ namespace apsi {
ciphertext and the vector holds the ciphertext data for different bundle indices.
@@ -108,12 +108,12 @@ index 4c4e116..8eb34fc 100644
--- a/common/apsi/network/sop.fbs
+++ b/common/apsi/network/sop.fbs
@@ -10,6 +10,7 @@ table ParmsRequest {
-
+
table OPRFRequest {
data:[ubyte] (required);
+ bucket_idx:uint32;
}
-
+
table QueryRequestPart {
@@ -21,6 +22,7 @@ table QueryRequest {
compression_type:ubyte;
@@ -121,5 +121,5 @@ index 4c4e116..8eb34fc 100644
query:[QueryRequestPart] (required);
+ bucket_idx:uint32;
}
-
+
union Request { ParmsRequest, OPRFRequest, QueryRequest }
diff --git a/bazel/patches/boost.patch b/bazel/patches/boost.patch
deleted file mode 100644
index 6772b61..0000000
--- a/bazel/patches/boost.patch
+++ /dev/null
@@ -1,42 +0,0 @@
-diff --git a/config.lzma-linux.h b/config.lzma-linux.h
-index e8b00d8..092696f 100644
---- a/config.lzma-linux.h
-+++ b/config.lzma-linux.h
-@@ -56,7 +56,9 @@
- /* #undef HAVE_COMMONCRYPTO_COMMONDIGEST_H */
-
- /* Define to 1 if you have the header file. */
--#define HAVE_CPUID_H 1
-+#ifdef __x86_64__
-+ #define HAVE_CPUID_H 1
-+#endif
-
- /* Define if the GNU dcgettext() function is already present or preinstalled.
- */
-@@ -309,7 +311,9 @@
-
- /* Define to 1 if _mm_clmulepi64_si128 is usable. See configure.ac for
- details. */
-+#ifdef __x86_64__
- #define HAVE_USABLE_CLMUL 1
-+#endif
-
- /* Define to 1 if you have the `utime' function. */
- /* #undef HAVE_UTIME */
-diff --git a/boost/boost.bzl b/boost/boost.bzl
-index 8277dbb..afc9569 100644
---- a/boost/boost.bzl
-+++ b/boost/boost.bzl
-@@ -139,9 +139,9 @@ def boost_deps():
- http_archive,
- name = "org_lzma_lzma",
- build_file = "@com_github_nelhage_rules_boost//:lzma.BUILD",
-- url = "https://github.com/tukaani-project/xz/releases/download/v5.4.4/xz-5.4.4.tar.gz",
-- sha256 = "aae39544e254cfd27e942d35a048d592959bd7a79f9a624afb0498bb5613bdf8",
-- strip_prefix = "xz-5.4.4",
-+ url = "https://src.fedoraproject.org/lookaside/extras/xz/xz-5.4.6.tar.gz/sha512/b08a61d8d478d3b4675cb1ddacdbbd98dc6941a55bcdd81a28679e54e9367d3a595fa123ac97874a17da571c1b712e2a3e901c2737099a9d268616a1ba3de497/xz-5.4.6.tar.gz",
-+ sha256 = "aeba3e03bf8140ddedf62a0a367158340520f6b384f75ca6045ccc6c0d43fd5c",
-+ strip_prefix = "xz-5.4.6",
- )
-
- maybe(
\ No newline at end of file
diff --git a/bazel/patches/emp-tool-cmake.patch b/bazel/patches/emp-tool-cmake.patch
deleted file mode 100644
index 01aa13d..0000000
--- a/bazel/patches/emp-tool-cmake.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index d9abb31..4c2c171 100755
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -56,11 +56,14 @@ find_package(OpenSSL REQUIRED)
- include_directories(${OPENSSL_INCLUDE_DIR})
-
-
--add_library(${NAME} SHARED ${sources})
-+add_library(${NAME} STATIC ${sources})
-
- install(DIRECTORY emp-tool DESTINATION include/)
- install(DIRECTORY cmake/ DESTINATION cmake/)
- install(TARGETS ${NAME} DESTINATION lib)
-
--ENABLE_TESTING()
--ADD_SUBDIRECTORY(test)
-+option(ENABLE_TESTS "Enable tests" OFF)
-+if (${ENABLE_TESTS})
-+ ENABLE_TESTING()
-+ ADD_SUBDIRECTORY(test)
-+endif()
diff --git a/bazel/patches/emp-tool-sse2neon.patch b/bazel/patches/emp-tool-sse2neon.patch
deleted file mode 100644
index e94b22e..0000000
--- a/bazel/patches/emp-tool-sse2neon.patch
+++ /dev/null
@@ -1,6507 +0,0 @@
-diff --git a/emp-tool/utils/sse2neon.h b/emp-tool/utils/sse2neon.h
-index d09b9c7..efa63a4 100644
---- a/emp-tool/utils/sse2neon.h
-+++ b/emp-tool/utils/sse2neon.h
-@@ -113,7 +113,7 @@
- #ifdef _MSC_VER
- #include
- #if (defined(_M_AMD64) || defined(__x86_64__)) || \
-- (defined(_M_ARM) || defined(__arm__))
-+ (defined(_M_ARM64) || defined(__arm64__))
- #define SSE2NEON_HAS_BITSCAN64
- #endif
- #endif
-@@ -441,7 +441,7 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */
- // by applications which attempt to access the contents of an __m128 struct
- // directly. It is important to note that accessing the __m128 struct directly
- // is bad coding practice by Microsoft: @see:
--// https://docs.microsoft.com/en-us/cpp/cpp/m128
-+// https://learn.microsoft.com/en-us/cpp/cpp/m128
- //
- // However, some legacy source code may try to access the contents of an __m128
- // struct directly so the developer can use the SIMDVec as an alias for it. Any
-@@ -621,47 +621,6 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
- * 4, 5, 12, 13, 6, 7, 14, 15);
- * // Shuffle packed 8-bit integers
- * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
-- *
-- * Data (Number, Binary, Byte Index):
-- +------+------+-------------+------+------+-------------+
-- | 1 | 2 | 3 | 4 | Number
-- +------+------+------+------+------+------+------+------+
-- | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
-- +------+------+------+------+------+------+------+------+
-- | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
-- +------+------+------+------+------+------+------+------+
--
-- +------+------+------+------+------+------+------+------+
-- | 5 | 6 | 7 | 8 | Number
-- +------+------+------+------+------+------+------+------+
-- | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
-- +------+------+------+------+------+------+------+------+
-- | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
-- +------+------+------+------+------+------+------+------+
-- * Index (Byte Index):
-- +------+------+------+------+------+------+------+------+
-- | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
-- +------+------+------+------+------+------+------+------+
--
-- +------+------+------+------+------+------+------+------+
-- | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
-- +------+------+------+------+------+------+------+------+
-- * Result:
-- +------+------+------+------+------+------+------+------+
-- | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
-- +------+------+------+------+------+------+------+------+
-- | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
-- +------+------+------+------+------+------+------+------+
-- | 256 | 2 | 5 | 6 | Number
-- +------+------+------+------+------+------+------+------+
--
-- +------+------+------+------+------+------+------+------+
-- | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
-- +------+------+------+------+------+------+------+------+
-- | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
-- +------+------+------+------+------+------+------+------+
-- | 3 | 7 | 4 | 8 | Number
-- +------+------+------+------+------+------+-------------+
- */
-
- /* Constants for use with _mm_prefetch. */
-@@ -1069,9 +1028,9 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
- })
- #endif
-
--// NEON does not support a general purpose permute intrinsic
--// Selects four specific single-precision, floating-point values from a and b,
--// based on the mask i.
-+// NEON does not support a general purpose permute intrinsic.
-+// Shuffle single-precision (32-bit) floating-point elements in a using the
-+// control in imm8, and store the results in dst.
- //
- // C equivalent:
- // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
-@@ -1082,7 +1041,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
- // return ret;
- // }
- //
--// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
- #define _mm_shuffle_ps_default(a, b, imm) \
- __extension__({ \
- float32x4_t ret; \
-@@ -1100,12 +1059,10 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
- vreinterpretq_m128_f32(ret); \
- })
-
--// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
--// by imm.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
--// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
--// __constrange(0,255) int
--// imm)
-+// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
-+// Store the results in the low 64 bits of dst, with the high 64 bits being
-+// copied from from a to dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
- #define _mm_shufflelo_epi16_function(a, imm) \
- __extension__({ \
- int16x8_t ret = vreinterpretq_s16_m128i(a); \
-@@ -1120,12 +1077,10 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
- vreinterpretq_m128i_s16(ret); \
- })
-
--// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
--// by imm.
--// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
--// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
--// __constrange(0,255) int
--// imm)
-+// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
-+// Store the results in the high 64 bits of dst, with the low 64 bits being
-+// copied from from a to dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
- #define _mm_shufflehi_epi16_function(a, imm) \
- __extension__({ \
- int16x8_t ret = vreinterpretq_s16_m128i(a); \
-@@ -1147,22 +1102,19 @@ FORCE_INLINE void _mm_empty(void) {}
-
- /* SSE */
-
--// Adds the four single-precision, floating-point values of a and b.
--//
--// r0 := a0 + b0
--// r1 := a1 + b1
--// r2 := a2 + b2
--// r3 := a3 + b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
-+// Add packed single-precision (32-bit) floating-point elements in a and b, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
- FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_f32(
- vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
-
--// adds the scalar single-precision floating point values of a and b.
--// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
-+// Add the lower single-precision (32-bit) floating-point element in a and b,
-+// store the result in the lower element of dst, and copy the upper 3 packed
-+// elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
- FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
- {
- float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-@@ -1171,30 +1123,18 @@ FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
- return vreinterpretq_m128_f32(vaddq_f32(a, value));
- }
-
--// Computes the bitwise AND of the four single-precision, floating-point values
--// of a and b.
--//
--// r0 := a0 & b0
--// r1 := a1 & b1
--// r2 := a2 & b2
--// r3 := a3 & b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
-+// Compute the bitwise AND of packed single-precision (32-bit) floating-point
-+// elements in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
- FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_s32(
- vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
- }
-
--// Computes the bitwise AND-NOT of the four single-precision, floating-point
--// values of a and b.
--//
--// r0 := ~a0 & b0
--// r1 := ~a1 & b1
--// r2 := ~a2 & b2
--// r3 := ~a3 & b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
-+// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
-+// elements in a and then AND with b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
- FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_s32(
-@@ -1204,13 +1144,7 @@ FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
-
- // Average packed unsigned 16-bit integers in a and b, and store the results in
- // dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
- FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
- {
- return vreinterpret_m64_u16(
-@@ -1219,186 +1153,199 @@ FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
-
- // Average packed unsigned 8-bit integers in a and b, and store the results in
- // dst.
--//
--// FOR j := 0 to 7
--// i := j*8
--// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
- FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
- {
- return vreinterpret_m64_u8(
- vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
- }
-
--// Compares for equality.
--// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for equality, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
- FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
-
--// Compares for equality.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for equality, store the result in the lower element of dst, and copy the
-+// upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
- FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
- }
-
--// Compares for greater than or equal.
--// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for greater-than-or-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
- FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(
- vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
-
--// Compares for greater than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for greater-than-or-equal, store the result in the lower element of dst,
-+// and copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
- FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpge_ps(a, b));
- }
-
--// Compares for greater than.
--//
--// r0 := (a0 > b0) ? 0xffffffff : 0x0
--// r1 := (a1 > b1) ? 0xffffffff : 0x0
--// r2 := (a2 > b2) ? 0xffffffff : 0x0
--// r3 := (a3 > b3) ? 0xffffffff : 0x0
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for greater-than, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
- FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(
- vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
-
--// Compares for greater than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for greater-than, store the result in the lower element of dst, and copy
-+// the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
- FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
- }
-
--// Compares for less than or equal.
--//
--// r0 := (a0 <= b0) ? 0xffffffff : 0x0
--// r1 := (a1 <= b1) ? 0xffffffff : 0x0
--// r2 := (a2 <= b2) ? 0xffffffff : 0x0
--// r3 := (a3 <= b3) ? 0xffffffff : 0x0
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for less-than-or-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
- FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(
- vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
-
--// Compares for less than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for less-than-or-equal, store the result in the lower element of dst, and
-+// copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
- FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmple_ps(a, b));
- }
-
--// Compares for less than
--// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for less-than, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
- FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(
- vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
-
--// Compares for less than
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for less-than, store the result in the lower element of dst, and copy the
-+// upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
- FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmplt_ps(a, b));
- }
-
--// Compares for inequality.
--// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
- FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(vmvnq_u32(
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
-
--// Compares for inequality.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-equal, store the result in the lower element of dst, and copy the
-+// upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
- FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
- }
-
--// Compares for not greater than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-greater-than-or-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
- FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(vmvnq_u32(
- vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
-
--// Compares for not greater than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-greater-than-or-equal, store the result in the lower element of
-+// dst, and copy the upper 3 packed elements from a to the upper elements of
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
- FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
- }
-
--// Compares for not greater than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-greater-than, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
- FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(vmvnq_u32(
- vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
-
--// Compares for not greater than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-greater-than, store the result in the lower element of dst, and
-+// copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
- FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
- }
-
--// Compares for not less than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-less-than-or-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
- FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(vmvnq_u32(
- vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
-
--// Compares for not less than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-less-than-or-equal, store the result in the lower element of dst,
-+// and copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
- FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
- }
-
--// Compares for not less than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-less-than, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
- FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(vmvnq_u32(
- vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
-
--// Compares for not less than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-less-than, store the result in the lower element of dst, and copy
-+// the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
- FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
- }
-
--// Compares the four 32-bit floats in a and b to check if any values are NaN.
--// Ordered compare between each value returns true for "orderable" and false for
--// "not orderable" (NaN).
--// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
--// also:
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// to see if neither is NaN, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
-+//
-+// See also:
- // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
- // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
- FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
-@@ -1413,15 +1360,18 @@ FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
- return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
- }
-
--// Compares for ordered.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b to see if neither is NaN, store the result in the lower element of dst, and
-+// copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
- FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpord_ps(a, b));
- }
-
--// Compares for unordered.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// to see if either is NaN, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
- FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
- {
- uint32x4_t f32a =
-@@ -1431,16 +1381,18 @@ FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
- return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
- }
-
--// Compares for unordered.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b to see if either is NaN, store the result in the lower element of dst, and
-+// copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
- FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
- }
-
--// Compares the lower single-precision floating point scalar values of a and b
--// using an equality operation. :
--// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for equality, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
- FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
- {
- uint32x4_t a_eq_b =
-@@ -1448,9 +1400,9 @@ FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
- return vgetq_lane_u32(a_eq_b, 0) & 0x1;
- }
-
--// Compares the lower single-precision floating point scalar values of a and b
--// using a greater than or equal operation. :
--// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for greater-than-or-equal, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
- FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
- {
- uint32x4_t a_ge_b =
-@@ -1458,9 +1410,9 @@ FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
- return vgetq_lane_u32(a_ge_b, 0) & 0x1;
- }
-
--// Compares the lower single-precision floating point scalar values of a and b
--// using a greater than operation. :
--// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for greater-than, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
- FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
- {
- uint32x4_t a_gt_b =
-@@ -1468,9 +1420,9 @@ FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
- return vgetq_lane_u32(a_gt_b, 0) & 0x1;
- }
-
--// Compares the lower single-precision floating point scalar values of a and b
--// using a less than or equal operation. :
--// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for less-than-or-equal, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
- FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
- {
- uint32x4_t a_le_b =
-@@ -1478,11 +1430,9 @@ FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
- return vgetq_lane_u32(a_le_b, 0) & 0x1;
- }
-
--// Compares the lower single-precision floating point scalar values of a and b
--// using a less than operation. :
--// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
--// note!! The documentation on MSDN is incorrect! If either of the values is a
--// NAN the docs say you will get a one, but in fact, it will return a zero!!
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for less-than, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
- FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
- {
- uint32x4_t a_lt_b =
-@@ -1490,9 +1440,9 @@ FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
- return vgetq_lane_u32(a_lt_b, 0) & 0x1;
- }
-
--// Compares the lower single-precision floating point scalar values of a and b
--// using an inequality operation. :
--// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for not-equal, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
- FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
- {
- return !_mm_comieq_ss(a, b);
-@@ -1502,13 +1452,7 @@ FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
- // (32-bit) floating-point elements, store the results in the lower 2 elements
- // of dst, and copy the upper 2 packed elements from a to the upper elements of
- // dst.
--//
--// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
--// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
--// dst[95:64] := a[95:64]
--// dst[127:96] := a[127:96]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
- FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
- {
- return vreinterpretq_m128_f32(
-@@ -1518,13 +1462,7 @@ FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
-
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 32-bit integers, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
- FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -1539,11 +1477,7 @@ FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
- // Convert the signed 32-bit integer b to a single-precision (32-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper 3 packed elements from a to the upper elements of dst.
--//
--// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
- FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
- {
- return vreinterpretq_m128_f32(
-@@ -1552,7 +1486,7 @@ FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
-
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 32-bit integer, and store the result in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
- FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -1567,14 +1501,7 @@ FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
-
- // Convert packed 16-bit integers in a to packed single-precision (32-bit)
- // floating-point elements, and store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// m := j*32
--// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
- FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
- {
- return vreinterpretq_m128_f32(
-@@ -1584,13 +1511,7 @@ FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
- // Convert packed 32-bit integers in b to packed single-precision (32-bit)
- // floating-point elements, store the results in the lower 2 elements of dst,
- // and copy the upper 2 packed elements from a to the upper elements of dst.
--//
--// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
--// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
--// dst[95:64] := a[95:64]
--// dst[127:96] := a[127:96]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
- FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
- {
- return vreinterpretq_m128_f32(
-@@ -1603,13 +1524,7 @@ FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
- // of dst, then convert the packed signed 32-bit integers in b to
- // single-precision (32-bit) floating-point element, and store the results in
- // the upper 2 elements of dst.
--//
--// dst[31:0] := Convert_Int32_To_FP32(a[31:0])
--// dst[63:32] := Convert_Int32_To_FP32(a[63:32])
--// dst[95:64] := Convert_Int32_To_FP32(b[31:0])
--// dst[127:96] := Convert_Int32_To_FP32(b[63:32])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
- FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
- {
- return vreinterpretq_m128_f32(vcvtq_f32_s32(
-@@ -1618,14 +1533,7 @@ FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
-
- // Convert the lower packed 8-bit integers in a to packed single-precision
- // (32-bit) floating-point elements, and store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*8
--// m := j*32
--// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
- FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
- {
- return vreinterpretq_m128_f32(vcvtq_f32_s32(
-@@ -1636,18 +1544,7 @@ FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
- // packed 16-bit integers, and store the results in dst. Note: this intrinsic
- // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
- // 0x7FFFFFFF.
--//
--// FOR j := 0 to 3
--// i := 16*j
--// k := 32*j
--// IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)
--// dst[i+15:i] := 0x7FFF
--// ELSE
--// dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
- FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
- {
- return vreinterpret_m64_s16(
-@@ -1656,31 +1553,14 @@ FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
-
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 32-bit integers, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
- #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
-
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 8-bit integers, and store the results in lower 4 elements of dst.
- // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
- // between 0x7F and 0x7FFFFFFF.
--//
--// FOR j := 0 to 3
--// i := 8*j
--// k := 32*j
--// IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)
--// dst[i+7:i] := 0x7F
--// ELSE
--// dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
- FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
- {
- return vreinterpret_m64_s8(vqmovn_s16(
-@@ -1689,14 +1569,7 @@ FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
-
- // Convert packed unsigned 16-bit integers in a to packed single-precision
- // (32-bit) floating-point elements, and store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// m := j*32
--// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
- FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
- {
- return vreinterpretq_m128_f32(
-@@ -1706,14 +1579,7 @@ FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
- // Convert the lower packed unsigned 8-bit integers in a to packed
- // single-precision (32-bit) floating-point elements, and store the results in
- // dst.
--//
--// FOR j := 0 to 3
--// i := j*8
--// m := j*32
--// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
- FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
- {
- return vreinterpretq_m128_f32(vcvtq_f32_u32(
-@@ -1723,21 +1589,13 @@ FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
- // Convert the signed 32-bit integer b to a single-precision (32-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper 3 packed elements from a to the upper elements of dst.
--//
--// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
- #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
-
- // Convert the signed 64-bit integer b to a single-precision (32-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper 3 packed elements from a to the upper elements of dst.
--//
--// dst[31:0] := Convert_Int64_To_FP32(b[63:0])
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
- FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
- {
- return vreinterpretq_m128_f32(
-@@ -1745,10 +1603,7 @@ FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
- }
-
- // Copy the lower single-precision (32-bit) floating-point element of a to dst.
--//
--// dst[31:0] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
- FORCE_INLINE float _mm_cvtss_f32(__m128 a)
- {
- return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-@@ -1756,18 +1611,12 @@ FORCE_INLINE float _mm_cvtss_f32(__m128 a)
-
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 32-bit integer, and store the result in dst.
--//
--// dst[31:0] := Convert_FP32_To_Int32(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
- #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
-
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 64-bit integer, and store the result in dst.
--//
--// dst[63:0] := Convert_FP32_To_Int64(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
- FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -1781,13 +1630,7 @@ FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
-
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 32-bit integers with truncation, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
- FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
- {
- return vreinterpret_m64_s32(
-@@ -1796,10 +1639,7 @@ FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
-
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 32-bit integer with truncation, and store the result in dst.
--//
--// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
- FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
- {
- return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-@@ -1807,60 +1647,49 @@ FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
-
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 32-bit integers with truncation, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
- #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
-
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 32-bit integer with truncation, and store the result in dst.
--//
--// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
- #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
-
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 64-bit integer with truncation, and store the result in dst.
--//
--// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
- FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
- {
- return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- }
-
--// Divides the four single-precision, floating-point values of a and b.
--//
--// r0 := a0 / b0
--// r1 := a1 / b1
--// r2 := a2 / b2
--// r3 := a3 / b3
--//
--// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
-+// Divide packed single-precision (32-bit) floating-point elements in a by
-+// packed elements in b, and store the results in dst.
-+// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement
-+// division by multiplying a by b's reciprocal before using the Newton-Raphson
-+// method to approximate the results.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
- FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
- {
--#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
-+#if defined(__aarch64__)
- return vreinterpretq_m128_f32(
- vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- #else
- float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
- recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
--#if SSE2NEON_PRECISE_DIV
- // Additional Netwon-Raphson iteration for accuracy
- recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
--#endif
- return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
- #endif
- }
-
--// Divides the scalar single-precision floating point value of a by b.
--// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
-+// Divide the lower single-precision (32-bit) floating-point element in a by the
-+// lower single-precision (32-bit) floating-point element in b, store the result
-+// in the lower element of dst, and copy the upper 3 packed elements from a to
-+// the upper elements of dst.
-+// Warning: ARMv7-A does not produce the same result compared to Intel and not
-+// IEEE-compliant.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
- FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
- {
- float32_t value =
-@@ -1871,12 +1700,12 @@ FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-
- // Extract a 16-bit integer from a, selected with imm8, and store the result in
- // the lower element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
- #define _mm_extract_pi16(a, imm) \
- (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
-
- // Free aligned memory that was allocated with _mm_malloc.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
- #if !defined(SSE2NEON_ALLOC_DEFINED)
- FORCE_INLINE void _mm_free(void *addr)
- {
-@@ -1887,7 +1716,7 @@ FORCE_INLINE void _mm_free(void *addr)
- // Macro: Get the flush zero bits from the MXCSR control and status register.
- // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
- // _MM_FLUSH_ZERO_OFF
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
- FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
- {
- union {
-@@ -1911,7 +1740,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
- // Macro: Get the rounding mode bits from the MXCSR control and status register.
- // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
- // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
- FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
- {
- union {
-@@ -1938,15 +1767,17 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
-
- // Copy a to dst, and insert the 16-bit integer i into dst at the location
- // specified by imm8.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
- #define _mm_insert_pi16(a, b, imm) \
- __extension__({ \
- vreinterpret_m64_s16( \
- vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
- })
-
--// Loads four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
-+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-+// boundary or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
- FORCE_INLINE __m128 _mm_load_ps(const float *p)
- {
- return vreinterpretq_m128_f32(vld1q_f32(p));
-@@ -1960,52 +1791,40 @@ FORCE_INLINE __m128 _mm_load_ps(const float *p)
- // dst[95:64] := MEM[mem_addr+31:mem_addr]
- // dst[127:96] := MEM[mem_addr+31:mem_addr]
- //
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
- #define _mm_load_ps1 _mm_load1_ps
-
--// Loads an single - precision, floating - point value into the low word and
--// clears the upper three words.
--// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
-+// Load a single-precision (32-bit) floating-point element from memory into the
-+// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
-+// aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
- FORCE_INLINE __m128 _mm_load_ss(const float *p)
- {
- return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
- }
-
--// Loads a single single-precision, floating-point value, copying it into all
--// four words
--// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
-+// Load a single-precision (32-bit) floating-point element from memory into all
-+// elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
- FORCE_INLINE __m128 _mm_load1_ps(const float *p)
- {
- return vreinterpretq_m128_f32(vld1q_dup_f32(p));
- }
-
--// Sets the upper two single-precision, floating-point values with 64
--// bits of data loaded from the address p; the lower two values are passed
--// through from a.
--//
--// r0 := a0
--// r1 := a1
--// r2 := *p0
--// r3 := *p1
--//
--// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
-+// Load 2 single-precision (32-bit) floating-point elements from memory into the
-+// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
-+// mem_addr does not need to be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
- FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
- {
- return vreinterpretq_m128_f32(
- vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
- }
-
--// Sets the lower two single-precision, floating-point values with 64
--// bits of data loaded from the address p; the upper two values are passed
--// through from a.
--//
--// Return Value
--// r0 := *p0
--// r1 := *p1
--// r2 := a2
--// r3 := a3
--//
--// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
-+// Load 2 single-precision (32-bit) floating-point elements from memory into the
-+// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
-+// mem_addr does not need to be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
- FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
- {
- return vreinterpretq_m128_f32(
-@@ -2015,21 +1834,17 @@ FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
- // Load 4 single-precision (32-bit) floating-point elements from memory into dst
- // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--// dst[31:0] := MEM[mem_addr+127:mem_addr+96]
--// dst[63:32] := MEM[mem_addr+95:mem_addr+64]
--// dst[95:64] := MEM[mem_addr+63:mem_addr+32]
--// dst[127:96] := MEM[mem_addr+31:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
- FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
- {
- float32x4_t v = vrev64q_f32(vld1q_f32(p));
- return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
- }
-
--// Loads four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
-+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-+// elements) from memory into dst. mem_addr does not need to be aligned on any
-+// particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
- FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
- {
- // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
-@@ -2038,11 +1853,7 @@ FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
- }
-
- // Load unaligned 16-bit integer from memory into the first element of dst.
--//
--// dst[15:0] := MEM[mem_addr+15:mem_addr]
--// dst[MAX:16] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
- FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
- {
- return vreinterpretq_m128i_s16(
-@@ -2050,20 +1861,17 @@ FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
- }
-
- // Load unaligned 64-bit integer from memory into the first element of dst.
--//
--// dst[63:0] := MEM[mem_addr+63:mem_addr]
--// dst[MAX:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
- FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
- {
- return vreinterpretq_m128i_s64(
- vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
- }
-
--// Allocate aligned blocks of memory.
--// https://software.intel.com/en-us/
--// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
-+// Allocate size bytes of memory, aligned to the alignment specified in align,
-+// and return a pointer to the allocated memory. _mm_free should be used to free
-+// memory that is allocated with _mm_malloc.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
- #if !defined(SSE2NEON_ALLOC_DEFINED)
- FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
- {
-@@ -2081,7 +1889,7 @@ FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
- // Conditionally store 8-bit integer elements from a into memory using mask
- // (elements are not stored when the highest bit is not set in the corresponding
- // element) and a non-temporal memory hint.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
- FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
- {
- int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
-@@ -2095,27 +1903,23 @@ FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
- // Conditionally store 8-bit integer elements from a into memory using mask
- // (elements are not stored when the highest bit is not set in the corresponding
- // element) and a non-temporal memory hint.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
- #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
-
- // Compare packed signed 16-bit integers in a and b, and store packed maximum
- // values in dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
- FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
- {
- return vreinterpret_m64_s16(
- vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
- }
-
--// Computes the maximums of the four single-precision, floating-point values of
--// a and b.
--// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b,
-+// and store packed maximum values in dst. dst does not follow the IEEE Standard
-+// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
-+// signed-zero values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
- FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
- {
- #if SSE2NEON_PRECISE_MINMAX
-@@ -2130,22 +1934,19 @@ FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
-
- // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
- // values in dst.
--//
--// FOR j := 0 to 7
--// i := j*8
--// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
- FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
- {
- return vreinterpret_m64_u8(
- vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
- }
-
--// Computes the maximum of the two lower scalar single-precision floating point
--// values of a and b.
--// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b, store the maximum value in the lower element of dst, and copy the upper 3
-+// packed elements from a to the upper element of dst. dst does not follow the
-+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
-+// inputs are NaN or signed-zero values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
- FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
- {
- float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
-@@ -2155,22 +1956,18 @@ FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
-
- // Compare packed signed 16-bit integers in a and b, and store packed minimum
- // values in dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
- FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
- {
- return vreinterpret_m64_s16(
- vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
- }
-
--// Computes the minima of the four single-precision, floating-point values of a
--// and b.
--// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b,
-+// and store packed minimum values in dst. dst does not follow the IEEE Standard
-+// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
-+// signed-zero values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
- FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
- {
- #if SSE2NEON_PRECISE_MINMAX
-@@ -2185,22 +1982,19 @@ FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
-
- // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
- // values in dst.
--//
--// FOR j := 0 to 7
--// i := j*8
--// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
- FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
- {
- return vreinterpret_m64_u8(
- vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
- }
-
--// Computes the minimum of the two lower scalar single-precision floating point
--// values of a and b.
--// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b, store the minimum value in the lower element of dst, and copy the upper 3
-+// packed elements from a to the upper element of dst. dst does not follow the
-+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
-+// inputs are NaN or signed-zero values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
- FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
- {
- float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
-@@ -2208,8 +2002,10 @@ FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
- vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
- }
-
--// Sets the low word to the single-precision, floating-point value of b
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
-+// Move the lower single-precision (32-bit) floating-point element from b to the
-+// lower element of dst, and copy the upper 3 packed elements from a to the
-+// upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
- FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_f32(
-@@ -2217,25 +2013,26 @@ FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
- vreinterpretq_f32_m128(a), 0));
- }
-
--// Moves the upper two values of B into the lower two values of A.
--//
--// r3 := a3
--// r2 := a2
--// r1 := b3
--// r0 := b2
--FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
--{
-- float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
-- float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
-+// Move the upper 2 single-precision (32-bit) floating-point elements from b to
-+// the lower 2 elements of dst, and copy the upper 2 elements from a to the
-+// upper 2 elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
-+FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
-+{
-+#if defined(aarch64__)
-+ return vreinterpretq_m128_u64(
-+ vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
-+#else
-+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
-+#endif
- }
-
--// Moves the lower two values of B into the upper two values of A.
--//
--// r3 := b1
--// r2 := b0
--// r1 := a1
--// r0 := a0
-+// Move the lower 2 single-precision (32-bit) floating-point elements from b to
-+// the upper 2 elements of dst, and copy the lower 2 elements from a to the
-+// lower 2 elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
- FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
- {
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
-@@ -2245,7 +2042,7 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
-
- // Create mask from the most significant bit of each 8-bit element in a, and
- // store the result in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
- FORCE_INLINE int _mm_movemask_pi8(__m64 a)
- {
- uint8x8_t input = vreinterpret_u8_m64(a);
-@@ -2264,10 +2061,9 @@ FORCE_INLINE int _mm_movemask_pi8(__m64 a)
- #endif
- }
-
--// NEON does not provide this method
--// Creates a 4-bit mask from the most significant bits of the four
--// single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
-+// Set each bit of mask dst based on the most significant bit of the
-+// corresponding packed single-precision (32-bit) floating-point element in a.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
- FORCE_INLINE int _mm_movemask_ps(__m128 a)
- {
- uint32x4_t input = vreinterpretq_u32_m128(a);
-@@ -2288,14 +2084,9 @@ FORCE_INLINE int _mm_movemask_ps(__m128 a)
- #endif
- }
-
--// Multiplies the four single-precision, floating-point values of a and b.
--//
--// r0 := a0 * b0
--// r1 := a1 * b1
--// r2 := a2 * b2
--// r3 := a3 * b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
-+// Multiply packed single-precision (32-bit) floating-point elements in a and b,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
- FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_f32(
-@@ -2305,11 +2096,7 @@ FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
- // Multiply the lower single-precision (32-bit) floating-point element in a and
- // b, store the result in the lower element of dst, and copy the upper 3 packed
- // elements from a to the upper elements of dst.
--//
--// dst[31:0] := a[31:0] * b[31:0]
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
- FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_mul_ps(a, b));
-@@ -2318,16 +2105,16 @@ FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
- // Multiply the packed unsigned 16-bit integers in a and b, producing
- // intermediate 32-bit integers, and store the high 16 bits of the intermediate
- // integers in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
- FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
- {
- return vreinterpret_m64_u16(vshrn_n_u32(
- vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
- }
-
--// Computes the bitwise OR of the four single-precision, floating-point values
--// of a and b.
--// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
-+// Compute the bitwise OR of packed single-precision (32-bit) floating-point
-+// elements in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
- FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_s32(
-@@ -2336,65 +2123,53 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
-
- // Average packed unsigned 8-bit integers in a and b, and store the results in
- // dst.
--//
--// FOR j := 0 to 7
--// i := j*8
--// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
- #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
-
- // Average packed unsigned 16-bit integers in a and b, and store the results in
- // dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
- #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
-
- // Extract a 16-bit integer from a, selected with imm8, and store the result in
- // the lower element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
- #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
-
- // Copy a to dst, and insert the 16-bit integer i into dst at the location
- // specified by imm8.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
- #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
-
- // Compare packed signed 16-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
- #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
-
- // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
- #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
-
- // Compare packed signed 16-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
- #define _m_pminsw(a, b) _mm_min_pi16(a, b)
-
- // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
- #define _m_pminub(a, b) _mm_min_pu8(a, b)
-
- // Create mask from the most significant bit of each 8-bit element in a, and
- // store the result in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
- #define _m_pmovmskb(a) _mm_movemask_pi8(a)
-
- // Multiply the packed unsigned 16-bit integers in a and b, producing
- // intermediate 32-bit integers, and store the high 16 bits of the intermediate
- // integers in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
- #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
-
- // Fetch the line of data from memory that contains address p to a location in
-@@ -2422,26 +2197,22 @@ FORCE_INLINE void _mm_prefetch(char const *p, int i)
- // b, then horizontally sum each consecutive 8 differences to produce four
- // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
- // 16 bits of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
- #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
-
- // Shuffle 16-bit integers in a using the control in imm8, and store the results
- // in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
- #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
-
- // Compute the approximate reciprocal of packed single-precision (32-bit)
- // floating-point elements in a, and store the results in dst. The maximum
- // relative error for this approximation is less than 1.5*2^-12.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
- FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
- {
- float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
- recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
--#if SSE2NEON_PRECISE_DIV
-- // Additional Netwon-Raphson iteration for accuracy
-- recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
--#endif
- return vreinterpretq_m128_f32(recip);
- }
-
-@@ -2449,30 +2220,21 @@ FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
- // floating-point element in a, store the result in the lower element of dst,
- // and copy the upper 3 packed elements from a to the upper elements of dst. The
- // maximum relative error for this approximation is less than 1.5*2^-12.
--//
--// dst[31:0] := (1.0 / a[31:0])
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
- FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
- {
- return _mm_move_ss(a, _mm_rcp_ps(a));
- }
-
--// Computes the approximations of the reciprocal square roots of the four
--// single-precision floating point values of in.
--// The current precision is 1% error.
--// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
-+// Compute the approximate reciprocal square root of packed single-precision
-+// (32-bit) floating-point elements in a, and store the results in dst. The
-+// maximum relative error for this approximation is less than 1.5*2^-12.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
- FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
- {
- float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
--#if SSE2NEON_PRECISE_SQRT
-- // Additional Netwon-Raphson iteration for accuracy
- out = vmulq_f32(
- out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
-- out = vmulq_f32(
-- out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
--#endif
- return vreinterpretq_m128_f32(out);
- }
-
-@@ -2480,7 +2242,7 @@ FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
- // (32-bit) floating-point element in a, store the result in the lower element
- // of dst, and copy the upper 3 packed elements from a to the upper elements of
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
- FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
- {
- return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
-@@ -2490,7 +2252,7 @@ FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
- // b, then horizontally sum each consecutive 8 differences to produce four
- // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
- // 16 bits of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
- FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
- {
- uint64x1_t t = vpaddl_u32(vpaddl_u16(
-@@ -2502,7 +2264,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
- // Macro: Set the flush zero bits of the MXCSR control and status register to
- // the value in unsigned 32-bit integer a. The flush zero may contain any of the
- // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
- FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
- {
- // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
-@@ -2531,16 +2293,18 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
- #endif
- }
-
--// Sets the four single-precision, floating-point values to the four inputs.
--// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
-+// Set packed single-precision (32-bit) floating-point elements in dst with the
-+// supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
- FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
- {
- float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
- return vreinterpretq_m128_f32(vld1q_f32(data));
- }
-
--// Sets the four single-precision, floating-point values to w.
--// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-+// Broadcast single-precision (32-bit) floating-point value a to all elements of
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
- FORCE_INLINE __m128 _mm_set_ps1(float _w)
- {
- return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-@@ -2550,7 +2314,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
- // the value in unsigned 32-bit integer a. The rounding mode may contain any of
- // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
- // _MM_ROUND_TOWARD_ZERO
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
- FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
- {
- union {
-@@ -2595,45 +2359,48 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
-
- // Copy single-precision (32-bit) floating-point element a to the lower element
- // of dst, and zero the upper 3 elements.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
- FORCE_INLINE __m128 _mm_set_ss(float a)
- {
- return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
- }
-
--// Sets the four single-precision, floating-point values to w.
--//
--// r0 := r1 := r2 := r3 := w
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-+// Broadcast single-precision (32-bit) floating-point value a to all elements of
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
- FORCE_INLINE __m128 _mm_set1_ps(float _w)
- {
- return vreinterpretq_m128_f32(vdupq_n_f32(_w));
- }
-
-+// Set the MXCSR control and status register with the value in unsigned 32-bit
-+// integer a.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
- // FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
- FORCE_INLINE void _mm_setcsr(unsigned int a)
- {
- _MM_SET_ROUNDING_MODE(a);
- }
-
-+// Get the unsigned 32-bit value of the MXCSR control and status register.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
- // FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
- FORCE_INLINE unsigned int _mm_getcsr()
- {
- return _MM_GET_ROUNDING_MODE();
- }
-
--// Sets the four single-precision, floating-point values to the four inputs in
--// reverse order.
--// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
-+// Set packed single-precision (32-bit) floating-point elements in dst with the
-+// supplied values in reverse order.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
- FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
- {
- float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
- return vreinterpretq_m128_f32(vld1q_f32(data));
- }
-
--// Clears the four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
-+// Return vector of type __m128 with all elements set to zero.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
- FORCE_INLINE __m128 _mm_setzero_ps(void)
- {
- return vreinterpretq_m128_f32(vdupq_n_f32(0));
-@@ -2641,7 +2408,7 @@ FORCE_INLINE __m128 _mm_setzero_ps(void)
-
- // Shuffle 16-bit integers in a using the control in imm8, and store the results
- // in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
- #ifdef _sse2neon_shuffle
- #define _mm_shuffle_pi16(a, imm) \
- __extension__({ \
-@@ -2775,19 +2542,17 @@ FORCE_INLINE void _mm_lfence(void)
- })
- #endif
-
--// Computes the approximations of square roots of the four single-precision,
--// floating-point values of a. First computes reciprocal square roots and then
--// reciprocals of the four values.
--//
--// r0 := sqrt(a0)
--// r1 := sqrt(a1)
--// r2 := sqrt(a2)
--// r3 := sqrt(a3)
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
-+// Compute the square root of packed single-precision (32-bit) floating-point
-+// elements in a, and store the results in dst.
-+// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement
-+// square root by multiplying input in with its reciprocal square root before
-+// using the Newton-Raphson method to approximate the results.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
- FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
- {
--#if SSE2NEON_PRECISE_SQRT
-+#if defined(__aarch64__)
-+ return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
-+#else
- float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-
- // Test for vrsqrteq_f32(0) -> positive infinity case.
-@@ -2798,28 +2563,23 @@ FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
- recip = vreinterpretq_f32_u32(
- vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
-
-- // Additional Netwon-Raphson iteration for accuracy
- recip = vmulq_f32(
- vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
- recip);
-+ // Additional Netwon-Raphson iteration for accuracy
- recip = vmulq_f32(
- vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
- recip);
-
- // sqrt(s) = s * 1/sqrt(s)
- return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
--#elif defined(__aarch64__)
-- return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
--#else
-- float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-- float32x4_t sq = vrecpeq_f32(recipsq);
-- return vreinterpretq_m128_f32(sq);
- #endif
- }
-
--// Computes the approximation of the square root of the scalar single-precision
--// floating point value of in.
--// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
-+// Compute the square root of the lower single-precision (32-bit) floating-point
-+// element in a, store the result in the lower element of dst, and copy the
-+// upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
- FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
- {
- float32_t value =
-@@ -2828,8 +2588,10 @@ FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
- vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
- }
-
--// Stores four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
-+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-+// or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
- FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
- {
- vst1q_f32(p, vreinterpretq_f32_m128(a));
-@@ -2838,21 +2600,16 @@ FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
- // Store the lower single-precision (32-bit) floating-point element from a into
- // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
- // boundary or a general-protection exception may be generated.
--//
--// MEM[mem_addr+31:mem_addr] := a[31:0]
--// MEM[mem_addr+63:mem_addr+32] := a[31:0]
--// MEM[mem_addr+95:mem_addr+64] := a[31:0]
--// MEM[mem_addr+127:mem_addr+96] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
- FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
- {
- float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- vst1q_f32(p, vdupq_n_f32(a0));
- }
-
--// Stores the lower single - precision, floating - point value.
--// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
-+// Store the lower single-precision (32-bit) floating-point element from a into
-+// memory. mem_addr does not need to be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
- FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
- {
- vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
-@@ -2861,34 +2618,20 @@ FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
- // Store the lower single-precision (32-bit) floating-point element from a into
- // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
- // boundary or a general-protection exception may be generated.
--//
--// MEM[mem_addr+31:mem_addr] := a[31:0]
--// MEM[mem_addr+63:mem_addr+32] := a[31:0]
--// MEM[mem_addr+95:mem_addr+64] := a[31:0]
--// MEM[mem_addr+127:mem_addr+96] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
- #define _mm_store1_ps _mm_store_ps1
-
--// Stores the upper two single-precision, floating-point values of a to the
--// address p.
--//
--// *p0 := a2
--// *p1 := a3
--//
--// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
-+// Store the upper 2 single-precision (32-bit) floating-point elements from a
-+// into memory.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
- FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
- {
- *p = vreinterpret_m64_f32(vget_high_f32(a));
- }
-
--// Stores the lower two single-precision floating point values of a to the
--// address p.
--//
--// *p0 := a0
--// *p1 := a1
--//
--// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
-+// Store the lower 2 single-precision (32-bit) floating-point elements from a
-+// into memory.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
- FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
- {
- *p = vreinterpret_m64_f32(vget_low_f32(a));
-@@ -2897,13 +2640,7 @@ FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
- // Store 4 single-precision (32-bit) floating-point elements from a into memory
- // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--// MEM[mem_addr+31:mem_addr] := a[127:96]
--// MEM[mem_addr+63:mem_addr+32] := a[95:64]
--// MEM[mem_addr+95:mem_addr+64] := a[63:32]
--// MEM[mem_addr+127:mem_addr+96] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
- FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
- {
- float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
-@@ -2911,22 +2648,24 @@ FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
- vst1q_f32(p, rev);
- }
-
--// Stores four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
-+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-+// elements) from a into memory. mem_addr does not need to be aligned on any
-+// particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
- FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
- {
- vst1q_f32(p, vreinterpretq_f32_m128(a));
- }
-
- // Stores 16-bits of integer data a at the address p.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
- FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
- {
- vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
- }
-
- // Stores 64-bits of integer data a at the address p.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
- FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
- {
- vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
-@@ -2934,7 +2673,7 @@ FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
-
- // Store 64-bits of integer data from a into memory using a non-temporal memory
- // hint.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
- FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
- {
- vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
-@@ -2942,7 +2681,7 @@ FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
-
- // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
- // point elements) from a into memory using a non-temporal memory hint.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
- FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
- {
- #if __has_builtin(__builtin_nontemporal_store)
-@@ -2952,14 +2691,10 @@ FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
- #endif
- }
-
--// Subtracts the four single-precision, floating-point values of a and b.
--//
--// r0 := a0 - b0
--// r1 := a1 - b1
--// r2 := a2 - b2
--// r3 := a3 - b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
-+// Subtract packed single-precision (32-bit) floating-point elements in b from
-+// packed single-precision (32-bit) floating-point elements in a, and store the
-+// results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
- FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_f32(
-@@ -2970,11 +2705,7 @@ FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
- // the lower single-precision (32-bit) floating-point element in a, store the
- // result in the lower element of dst, and copy the upper 3 packed elements from
- // a to the upper elements of dst.
--//
--// dst[31:0] := a[31:0] - b[31:0]
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
- FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_sub_ps(a, b));
-@@ -2983,7 +2714,7 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
- // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
- // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
- // transposed matrix in these vectors (row0 now contains column 0, etc.).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
- #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
- do { \
- float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
-@@ -3008,7 +2739,7 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
- #define _mm_ucomineq_ss _mm_comineq_ss
-
- // Return vector of type __m128i with undefined elements.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
- FORCE_INLINE __m128i _mm_undefined_si128(void)
- {
- #if defined(__GNUC__) || defined(__clang__)
-@@ -3023,7 +2754,7 @@ FORCE_INLINE __m128i _mm_undefined_si128(void)
- }
-
- // Return vector of type __m128 with undefined elements.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
- FORCE_INLINE __m128 _mm_undefined_ps(void)
- {
- #if defined(__GNUC__) || defined(__clang__)
-@@ -3037,15 +2768,9 @@ FORCE_INLINE __m128 _mm_undefined_ps(void)
- #endif
- }
-
--// Selects and interleaves the upper two single-precision, floating-point values
--// from a and b.
--//
--// r0 := a2
--// r1 := b2
--// r2 := a3
--// r3 := b3
--//
--// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
-+// Unpack and interleave single-precision (32-bit) floating-point elements from
-+// the high half a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
- FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
- {
- #if defined(__aarch64__)
-@@ -3059,15 +2784,9 @@ FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
- #endif
- }
-
--// Selects and interleaves the lower two single-precision, floating-point values
--// from a and b.
--//
--// r0 := a0
--// r1 := b0
--// r2 := a1
--// r3 := b1
--//
--// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
-+// Unpack and interleave single-precision (32-bit) floating-point elements from
-+// the low half of a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
- FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
- {
- #if defined(__aarch64__)
-@@ -3081,9 +2800,9 @@ FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
- #endif
- }
-
--// Computes bitwise EXOR (exclusive-or) of the four single-precision,
--// floating-point values of a and b.
--// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
-+// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
-+// elements in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
- FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_s32(
-@@ -3092,42 +2811,32 @@ FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
-
- /* SSE2 */
-
--// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
--// unsigned 16-bit integers in b.
--// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
-+// Add packed 16-bit integers in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
- FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
- vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
--// unsigned 32-bit integers in b.
--//
--// r0 := a0 + b0
--// r1 := a1 + b1
--// r2 := a2 + b2
--// r3 := a3 + b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-+// Add packed 32-bit integers in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
- FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
- vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or
--// unsigned 32-bit integers in b.
--// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-+// Add packed 64-bit integers in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
- FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s64(
- vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
- }
-
--// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
--// unsigned 8-bit integers in b.
--// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
-+// Add packed 8-bit integers in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
- FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
-@@ -3136,7 +2845,7 @@ FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
-
- // Add packed double-precision (64-bit) floating-point elements in a and b, and
- // store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
- FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3155,11 +2864,7 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
- // Add the lower double-precision (64-bit) floating-point element in a and b,
- // store the result in the lower element of dst, and copy the upper element from
- // a to the upper element of dst.
--//
--// dst[63:0] := a[63:0] + b[63:0]
--// dst[127:64] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
- FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3175,25 +2880,16 @@ FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
- }
-
- // Add 64-bit integers a and b, and store the result in dst.
--//
--// dst[63:0] := a[63:0] + b[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
- FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
- {
- return vreinterpret_m64_s64(
- vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
- }
-
--// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
--// and saturates.
--//
--// r0 := SignedSaturate(a0 + b0)
--// r1 := SignedSaturate(a1 + b1)
--// ...
--// r7 := SignedSaturate(a7 + b7)
--//
--// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
-+// Add packed signed 16-bit integers in a and b using saturation, and store the
-+// results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
- FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
-@@ -3202,13 +2898,7 @@ FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
-
- // Add packed signed 8-bit integers in a and b using saturation, and store the
- // results in dst.
--//
--// FOR j := 0 to 15
--// i := j*8
--// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
- FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
-@@ -3217,16 +2907,16 @@ FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
-
- // Add packed unsigned 16-bit integers in a and b using saturation, and store
- // the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
- FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
- vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
- }
-
--// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
--// b and saturates..
--// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
-+// Add packed unsigned 8-bit integers in a and b using saturation, and store the
-+// results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
- FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -3235,25 +2925,16 @@ FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
-
- // Compute the bitwise AND of packed double-precision (64-bit) floating-point
- // elements in a and b, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// dst[i+63:i] := a[i+63:i] AND b[i+63:i]
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
- FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
- {
- return vreinterpretq_m128d_s64(
- vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
- }
-
--// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
--// b.
--//
--// r := a & b
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
-+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-+// and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
- FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
-@@ -3262,13 +2943,7 @@ FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
-
- // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
- // elements in a and then AND with b, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
- FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
- {
- // *NOTE* argument swap
-@@ -3276,12 +2951,9 @@ FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
- vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
- }
-
--// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
--// 128-bit value in a.
--//
--// r := (~a) & b
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
-+// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
-+// AND with b, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
- FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
-@@ -3289,30 +2961,18 @@ FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
- vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
- }
-
--// Computes the average of the 8 unsigned 16-bit integers in a and the 8
--// unsigned 16-bit integers in b and rounds.
--//
--// r0 := (a0 + b0) / 2
--// r1 := (a1 + b1) / 2
--// ...
--// r7 := (a7 + b7) / 2
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
-+// Average packed unsigned 16-bit integers in a and b, and store the results in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
- FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
- {
- return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
- vreinterpretq_u16_m128i(b));
- }
-
--// Computes the average of the 16 unsigned 8-bit integers in a and the 16
--// unsigned 8-bit integers in b and rounds.
--//
--// r0 := (a0 + b0) / 2
--// r1 := (a1 + b1) / 2
--// ...
--// r15 := (a15 + b15) / 2
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
-+// Average packed unsigned 8-bit integers in a and b, and store the results in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
- FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -3321,17 +2981,17 @@ FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
-
- // Shift a left by imm8 bytes while shifting in zeros, and store the results in
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
- #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
-
- // Shift a right by imm8 bytes while shifting in zeros, and store the results in
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
- #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
-
- // Cast vector of type __m128d to type __m128. This intrinsic is only used for
- // compilation and does not generate any instructions, thus it has zero latency.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
- FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
- {
- return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
-@@ -3339,7 +2999,7 @@ FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
-
- // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
- // compilation and does not generate any instructions, thus it has zero latency.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
- FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
- {
- return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
-@@ -3347,15 +3007,15 @@ FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
-
- // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
- // compilation and does not generate any instructions, thus it has zero latency.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
- FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
- {
- return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
- }
-
--// Applies a type cast to reinterpret four 32-bit floating point values passed
--// in as a 128-bit parameter as packed 32-bit integers.
--// https://msdn.microsoft.com/en-us/library/bb514099.aspx
-+// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
-+// compilation and does not generate any instructions, thus it has zero latency.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
- FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
- {
- return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
-@@ -3363,7 +3023,7 @@ FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
-
- // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
- // compilation and does not generate any instructions, thus it has zero latency.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
- FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
- {
- #if defined(__aarch64__)
-@@ -3373,9 +3033,9 @@ FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
- #endif
- }
-
--// Applies a type cast to reinterpret four 32-bit integers passed in as a
--// 128-bit parameter as packed 32-bit floating point values.
--// https://msdn.microsoft.com/en-us/library/bb514029.aspx
-+// Cast vector of type __m128i to type __m128. This intrinsic is only used for
-+// compilation and does not generate any instructions, thus it has zero latency.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
- FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
- {
- return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
-@@ -3406,9 +3066,9 @@ FORCE_INLINE void _mm_clflush(void const *p)
- #endif
- }
-
--// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
--// unsigned 16-bit integers in b for equality.
--// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
-+// Compare packed 16-bit integers in a and b for equality, and store the results
-+// in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
- FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
-@@ -3416,16 +3076,17 @@ FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
- }
-
- // Compare packed 32-bit integers in a and b for equality, and store the results
--// in dst
-+// in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
- FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u32(
- vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
--// unsigned 8-bit integers in b for equality.
--// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
-+// Compare packed 8-bit integers in a and b for equality, and store the results
-+// in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
- FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -3434,7 +3095,7 @@ FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for equality, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
- FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3452,7 +3113,7 @@ FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for equality, store the result in the lower element of dst, and copy the
- // upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
- FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
-@@ -3460,7 +3121,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for greater-than-or-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
- FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3482,7 +3143,7 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for greater-than-or-equal, store the result in the lower element of dst,
- // and copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
- FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3500,39 +3161,27 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
- #endif
- }
-
--// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
--// in b for greater than.
--//
--// r0 := (a0 > b0) ? 0xffff : 0x0
--// r1 := (a1 > b1) ? 0xffff : 0x0
--// ...
--// r7 := (a7 > b7) ? 0xffff : 0x0
--//
--// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
-+// Compare packed signed 16-bit integers in a and b for greater-than, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
- FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
- vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
--// in b for greater than.
--// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
-+// Compare packed signed 32-bit integers in a and b for greater-than, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
- FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u32(
- vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
--// in b for greater than.
--//
--// r0 := (a0 > b0) ? 0xff : 0x0
--// r1 := (a1 > b1) ? 0xff : 0x0
--// ...
--// r15 := (a15 > b15) ? 0xff : 0x0
--//
--// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
-+// Compare packed signed 8-bit integers in a and b for greater-than, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
- FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -3541,7 +3190,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for greater-than, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
- FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3563,7 +3212,7 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for greater-than, store the result in the lower element of dst, and copy
- // the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
- FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3583,7 +3232,7 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for less-than-or-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
- FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3605,7 +3254,7 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for less-than-or-equal, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
- FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3623,34 +3272,30 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
- #endif
- }
-
--// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
--// in b for less than.
--//
--// r0 := (a0 < b0) ? 0xffff : 0x0
--// r1 := (a1 < b1) ? 0xffff : 0x0
--// ...
--// r7 := (a7 < b7) ? 0xffff : 0x0
--//
--// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
-+// Compare packed signed 16-bit integers in a and b for less-than, and store the
-+// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
-+// order of the operands switched.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
- FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
- vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--
--// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
--// in b for less than.
--// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
-+// Compare packed signed 32-bit integers in a and b for less-than, and store the
-+// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
-+// order of the operands switched.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
- FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u32(
- vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
--// in b for lesser than.
--// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
-+// Compare packed signed 8-bit integers in a and b for less-than, and store the
-+// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
-+// order of the operands switched.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
- FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -3659,7 +3304,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for less-than, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
- FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3681,7 +3326,7 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for less-than, store the result in the lower element of dst, and copy the
- // upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
- FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3700,7 +3345,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
- FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3718,7 +3363,7 @@ FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-equal, store the result in the lower element of dst, and copy the
- // upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
- FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
-@@ -3726,7 +3371,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-greater-than-or-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
- FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3751,7 +3396,7 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-greater-than-or-equal, store the result in the lower element of
- // dst, and copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
- FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
-@@ -3759,7 +3404,7 @@ FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-greater-than, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
- FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3784,7 +3429,7 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-greater-than, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
- FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
-@@ -3792,7 +3437,7 @@ FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-less-than-or-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
- FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3817,7 +3462,7 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-less-than-or-equal, store the result in the lower element of dst,
- // and copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
- FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
-@@ -3825,7 +3470,7 @@ FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-less-than, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
- FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3850,7 +3495,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-less-than, store the result in the lower element of dst, and copy
- // the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
- FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
-@@ -3858,7 +3503,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // to see if neither is NaN, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
- FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3890,7 +3535,7 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b to see if neither is NaN, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
- FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3912,7 +3557,7 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // to see if either is NaN, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
- FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3945,7 +3590,7 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b to see if either is NaN, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
- FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3967,7 +3612,7 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
-
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for greater-than-or-equal, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
- FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3982,7 +3627,7 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
-
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for greater-than, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
- FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3997,7 +3642,7 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
-
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for less-than-or-equal, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
- FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4012,7 +3657,7 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
-
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for less-than, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
- FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4027,7 +3672,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
-
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for equality, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
- FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4048,7 +3693,7 @@ FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
-
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for not-equal, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
- FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
- {
- return !_mm_comieq_sd(a, b);
-@@ -4056,14 +3701,7 @@ FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
-
- // Convert packed signed 32-bit integers in a to packed double-precision
- // (64-bit) floating-point elements, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*32
--// m := j*64
--// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
- FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
- {
- #if defined(__aarch64__)
-@@ -4076,9 +3714,9 @@ FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
- #endif
- }
-
--// Converts the four signed 32-bit integer values of a to single-precision,
--// floating-point values
--// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
-+// Convert packed signed 32-bit integers in a to packed single-precision
-+// (32-bit) floating-point elements, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
- FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
- {
- return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
-@@ -4086,14 +3724,7 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
-
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed 32-bit integers, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// k := 64*j
--// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
- FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
- {
- // vrnd32xq_f64 not supported on clang
-@@ -4112,14 +3743,7 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
-
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed 32-bit integers, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// k := 64*j
--// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
- FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
- {
- __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-@@ -4132,15 +3756,7 @@ FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed single-precision (32-bit) floating-point elements, and store the
- // results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// k := 64*j
--// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
--// ENDFOR
--// dst[127:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
- FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4155,14 +3771,7 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
-
- // Convert packed signed 32-bit integers in a to packed double-precision
- // (64-bit) floating-point elements, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*32
--// m := j*64
--// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
- FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
- {
- #if defined(__aarch64__)
-@@ -4175,15 +3784,9 @@ FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
- #endif
- }
-
--// Converts the four single-precision, floating-point values of a to signed
--// 32-bit integer values.
--//
--// r0 := (int) a0
--// r1 := (int) a1
--// r2 := (int) a2
--// r3 := (int) a3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
-+// Convert packed single-precision (32-bit) floating-point elements in a to
-+// packed 32-bit integers, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
- // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
- // does not support! It is supported on ARMv8-A however.
- FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
-@@ -4240,14 +3843,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed double-precision (64-bit) floating-point elements, and store the
- // results in dst.
--//
--// FOR j := 0 to 1
--// i := 64*j
--// k := 32*j
--// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
- FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
- {
- #if defined(__aarch64__)
-@@ -4261,10 +3857,7 @@ FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
- }
-
- // Copy the lower double-precision (64-bit) floating-point element of a to dst.
--//
--// dst[63:0] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
- FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4276,10 +3869,7 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
-
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 32-bit integer, and store the result in dst.
--//
--// dst[31:0] := Convert_FP64_To_Int32(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
- FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4293,10 +3883,7 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
-
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 64-bit integer, and store the result in dst.
--//
--// dst[63:0] := Convert_FP64_To_Int64(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
- FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4310,17 +3897,14 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
-
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 64-bit integer, and store the result in dst.
--//
--// dst[63:0] := Convert_FP64_To_Int64(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
- #define _mm_cvtsd_si64x _mm_cvtsd_si64
-
- // Convert the lower double-precision (64-bit) floating-point element in b to a
- // single-precision (32-bit) floating-point element, store the result in the
- // lower element of dst, and copy the upper 3 packed elements from a to the
- // upper elements of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
- FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4334,33 +3918,27 @@ FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
- }
-
- // Copy the lower 32-bit integer in a to dst.
--//
--// dst[31:0] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
- FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
- {
- return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
- }
-
- // Copy the lower 64-bit integer in a to dst.
--//
--// dst[63:0] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
- FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
- {
- return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
- }
-
- // Copy the lower 64-bit integer in a to dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
- #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
- // Convert the signed 32-bit integer b to a double-precision (64-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
- FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
- {
- #if defined(__aarch64__)
-@@ -4374,21 +3952,12 @@ FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
- }
-
- // Copy the lower 64-bit integer in a to dst.
--//
--// dst[63:0] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
- #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
--// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
--// zero extending the upper bits.
--//
--// r0 := a
--// r1 := 0x0
--// r2 := 0x0
--// r3 := 0x0
--//
--// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
-+// Copy 32-bit integer a to the lower elements of dst, and zero the upper
-+// elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
- FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
- {
- return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
-@@ -4397,7 +3966,7 @@ FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
- // Convert the signed 64-bit integer b to a double-precision (64-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
- FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
- {
- #if defined(__aarch64__)
-@@ -4410,11 +3979,9 @@ FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
- #endif
- }
-
--// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
--// zero extending the upper bits.
--//
--// r0 := a
--// r1 := 0x0
-+// Copy 64-bit integer a to the lower element of dst, and zero the upper
-+// element.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
- FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
- {
- return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
-@@ -4422,24 +3989,20 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
-
- // Copy 64-bit integer a to the lower element of dst, and zero the upper
- // element.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
- #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
-
- // Convert the signed 64-bit integer b to a double-precision (64-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
- #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
-
- // Convert the lower single-precision (32-bit) floating-point element in b to a
- // double-precision (64-bit) floating-point element, store the result in the
- // lower element of dst, and copy the upper element from a to the upper element
- // of dst.
--//
--// dst[63:0] := Convert_FP32_To_FP64(b[31:0])
--// dst[127:64] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
- FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
- {
- double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-@@ -4454,7 +4017,7 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
-
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed 32-bit integers with truncation, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
- FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
- {
- double a0 = ((double *) &a)[0];
-@@ -4464,7 +4027,7 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
-
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed 32-bit integers with truncation, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
- FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
- {
- double a0 = ((double *) &a)[0];
-@@ -4473,9 +4036,9 @@ FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
- return vreinterpret_m64_s32(vld1_s32(data));
- }
-
--// Converts the four single-precision, floating-point values of a to signed
--// 32-bit integer values using truncate.
--// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
-+// Convert packed single-precision (32-bit) floating-point elements in a to
-+// packed 32-bit integers with truncation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
- FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
- {
- return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
-@@ -4483,10 +4046,7 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
-
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 32-bit integer with truncation, and store the result in dst.
--//
--// dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
- FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
- {
- double ret = *((double *) &a);
-@@ -4495,10 +4055,7 @@ FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
-
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 64-bit integer with truncation, and store the result in dst.
--//
--// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
- FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4511,21 +4068,12 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
-
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 64-bit integer with truncation, and store the result in dst.
--//
--// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
- #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
-
- // Divide packed double-precision (64-bit) floating-point elements in a by
- // packed elements in b, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 64*j
--// dst[i+63:i] := a[i+63:i] / b[i+63:i]
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
- FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4545,7 +4093,7 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
- // lower double-precision (64-bit) floating-point element in b, store the result
- // in the lower element of dst, and copy the upper element from a to the upper
- // element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
- FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4558,16 +4106,16 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
- #endif
- }
-
--// Extracts the selected signed or unsigned 16-bit integer from a and zero
--// extends.
--// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
-+// Extract a 16-bit integer from a, selected with imm8, and store the result in
-+// the lower element of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
- // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
- #define _mm_extract_epi16(a, imm) \
- vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
-
--// Inserts the least significant 16 bits of b into the selected 16-bit integer
--// of a.
--// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
-+// Copy a to dst, and insert the 16-bit integer i into dst at the location
-+// specified by imm8.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
- // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
- // __constrange(0,8) int imm)
- #define _mm_insert_epi16(a, b, imm) \
-@@ -4576,12 +4124,10 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
- vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
- })
-
--// Loads two double-precision from 16-byte aligned memory, floating-point
--// values.
--//
--// dst[127:0] := MEM[mem_addr+127:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
-+// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-+// boundary or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
- FORCE_INLINE __m128d _mm_load_pd(const double *p)
- {
- #if defined(__aarch64__)
-@@ -4595,21 +4141,13 @@ FORCE_INLINE __m128d _mm_load_pd(const double *p)
-
- // Load a double-precision (64-bit) floating-point element from memory into both
- // elements of dst.
--//
--// dst[63:0] := MEM[mem_addr+63:mem_addr]
--// dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
- #define _mm_load_pd1 _mm_load1_pd
-
- // Load a double-precision (64-bit) floating-point element from memory into the
- // lower of dst, and zero the upper element. mem_addr does not need to be
- // aligned on any particular boundary.
--//
--// dst[63:0] := MEM[mem_addr+63:mem_addr]
--// dst[127:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
- FORCE_INLINE __m128d _mm_load_sd(const double *p)
- {
- #if defined(__aarch64__)
-@@ -4621,8 +4159,9 @@ FORCE_INLINE __m128d _mm_load_sd(const double *p)
- #endif
- }
-
--// Loads 128-bit value. :
--// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
-+// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
-+// on a 16-byte boundary or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
- FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
- {
- return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-@@ -4630,11 +4169,7 @@ FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
-
- // Load a double-precision (64-bit) floating-point element from memory into both
- // elements of dst.
--//
--// dst[63:0] := MEM[mem_addr+63:mem_addr]
--// dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
- FORCE_INLINE __m128d _mm_load1_pd(const double *p)
- {
- #if defined(__aarch64__)
-@@ -4647,11 +4182,7 @@ FORCE_INLINE __m128d _mm_load1_pd(const double *p)
- // Load a double-precision (64-bit) floating-point element from memory into the
- // upper element of dst, and copy the lower element from a to dst. mem_addr does
- // not need to be aligned on any particular boundary.
--//
--// dst[63:0] := a[63:0]
--// dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
- FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
- {
- #if defined(__aarch64__)
-@@ -4664,7 +4195,7 @@ FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
- }
-
- // Load 64-bit integer from memory into the first element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
- FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
- {
- /* Load the lower 64 bits of the value pointed to by p into the
-@@ -4677,11 +4208,7 @@ FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
- // Load a double-precision (64-bit) floating-point element from memory into the
- // lower element of dst, and copy the upper element from a to dst. mem_addr does
- // not need to be aligned on any particular boundary.
--//
--// dst[63:0] := MEM[mem_addr+63:mem_addr]
--// dst[127:64] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
- FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
- {
- #if defined(__aarch64__)
-@@ -4697,11 +4224,7 @@ FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
- // Load 2 double-precision (64-bit) floating-point elements from memory into dst
- // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--// dst[63:0] := MEM[mem_addr+127:mem_addr+64]
--// dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
- FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
- {
- #if defined(__aarch64__)
-@@ -4714,39 +4237,32 @@ FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
- }
-
- // Loads two double-precision from unaligned memory, floating-point values.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
- FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
- {
- return _mm_load_pd(p);
- }
-
--// Loads 128-bit value. :
--// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
-+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
-+// be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
- FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
- {
- return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
- }
-
- // Load unaligned 32-bit integer from memory into the first element of dst.
--//
--// dst[31:0] := MEM[mem_addr+31:mem_addr]
--// dst[MAX:32] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
- FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
- {
- return vreinterpretq_m128i_s32(
- vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
- }
-
--// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
--// integers from b.
--//
--// r0 := (a0 * b0) + (a1 * b1)
--// r1 := (a2 * b2) + (a3 * b3)
--// r2 := (a4 * b4) + (a5 * b5)
--// r3 := (a6 * b6) + (a7 * b7)
--// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
-+// Multiply packed signed 16-bit integers in a and b, producing intermediate
-+// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
-+// 32-bit integers, and pack the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
- FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
- {
- int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-@@ -4771,7 +4287,7 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
- // (elements are not stored when the highest bit is not set in the corresponding
- // element) and a non-temporal memory hint. mem_addr does not need to be aligned
- // on any particular boundary.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
- FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
- {
- int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
-@@ -4782,18 +4298,18 @@ FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
- vst1q_s8((int8_t *) mem_addr, masked);
- }
-
--// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
--// signed 16-bit integers from b.
--// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
-+// Compare packed signed 16-bit integers in a and b, and store packed maximum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
- FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
- vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
--// 16 unsigned 8-bit integers from b.
--// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
-+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
- FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -4802,7 +4318,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b,
- // and store packed maximum values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
- FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4830,7 +4346,7 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b, store the maximum value in the lower element of dst, and copy the upper
- // element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
- FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4843,18 +4359,18 @@ FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
- #endif
- }
-
--// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
--// signed 16-bit integers from b.
--// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
-+// Compare packed signed 16-bit integers in a and b, and store packed minimum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
- FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
- vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
--// 16 unsigned 8-bit integers from b.
--// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
-+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
- FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -4863,7 +4379,7 @@ FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b,
- // and store packed minimum values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
- FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4890,7 +4406,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b, store the minimum value in the lower element of dst, and copy the upper
- // element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
- FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4905,11 +4421,7 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
-
- // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
- // upper element.
--//
--// dst[63:0] := a[63:0]
--// dst[127:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
- FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
- {
- return vreinterpretq_m128i_s64(
-@@ -4919,11 +4431,7 @@ FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
- // Move the lower double-precision (64-bit) floating-point element from b to the
- // lower element of dst, and copy the upper element from a to the upper element
- // of dst.
--//
--// dst[63:0] := b[63:0]
--// dst[127:64] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
- FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
- {
- return vreinterpretq_m128d_f32(
-@@ -4931,10 +4439,9 @@ FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
- vget_high_f32(vreinterpretq_f32_m128d(a))));
- }
-
--// NEON does not provide a version of this function.
--// Creates a 16-bit mask from the most significant bits of the 16 signed or
--// unsigned 8-bit integers in a and zero extends the upper bits.
--// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
-+// Create mask from the most significant bit of each 8-bit element in a, and
-+// store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
- FORCE_INLINE int _mm_movemask_epi8(__m128i a)
- {
- // Use increasingly wide shifts+adds to collect the sign bits
-@@ -5017,7 +4524,7 @@ FORCE_INLINE int _mm_movemask_epi8(__m128i a)
-
- // Set each bit of mask dst based on the most significant bit of the
- // corresponding packed double-precision (64-bit) floating-point element in a.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
- FORCE_INLINE int _mm_movemask_pd(__m128d a)
- {
- uint64x2_t input = vreinterpretq_u64_m128d(a);
-@@ -5026,10 +4533,7 @@ FORCE_INLINE int _mm_movemask_pd(__m128d a)
- }
-
- // Copy the lower 64-bit integer in a to dst.
--//
--// dst[63:0] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
- FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
- {
- return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
-@@ -5037,11 +4541,7 @@ FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
-
- // Copy the 64-bit integer a to the lower element of dst, and zero the upper
- // element.
--//
--// dst[63:0] := a[63:0]
--// dst[127:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
- FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
- {
- return vreinterpretq_m128i_s64(
-@@ -5050,9 +4550,7 @@ FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
-
- // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
- // a and b, and store the unsigned 64-bit results in dst.
--//
--// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
--// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
- FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
- {
- // vmull_u32 upcasts instead of masking, so we downcast.
-@@ -5063,7 +4561,7 @@ FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
-
- // Multiply packed double-precision (64-bit) floating-point elements in a and b,
- // and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
- FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -5082,7 +4580,7 @@ FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
- // Multiply the lower double-precision (64-bit) floating-point element in a and
- // b, store the result in the lower element of dst, and copy the upper element
- // from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
- FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_mul_pd(a, b));
-@@ -5090,25 +4588,17 @@ FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
-
- // Multiply the low unsigned 32-bit integers from a and b, and store the
- // unsigned 64-bit result in dst.
--//
--// dst[63:0] := a[31:0] * b[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
- FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
- {
- return vreinterpret_m64_u64(vget_low_u64(
- vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
- }
-
--// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
--// integers from b.
--//
--// r0 := (a0 * b0)[31:16]
--// r1 := (a1 * b1)[31:16]
--// ...
--// r7 := (a7 * b7)[31:16]
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
-+// Multiply the packed signed 16-bit integers in a and b, producing intermediate
-+// 32-bit integers, and store the high 16 bits of the intermediate integers in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
- FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
- {
- /* FIXME: issue with large values because of result saturation */
-@@ -5129,7 +4619,7 @@ FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
- // Multiply the packed unsigned 16-bit integers in a and b, producing
- // intermediate 32-bit integers, and store the high 16 bits of the intermediate
- // integers in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
- FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
- {
- uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
-@@ -5151,15 +4641,9 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
- #endif
- }
-
--// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
--// unsigned 16-bit integers from b.
--//
--// r0 := (a0 * b0)[15:0]
--// r1 := (a1 * b1)[15:0]
--// ...
--// r7 := (a7 * b7)[15:0]
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
-+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
-+// integers, and store the low 16 bits of the intermediate integers in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
- FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
-@@ -5168,27 +4652,25 @@ FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
-
- // Compute the bitwise OR of packed double-precision (64-bit) floating-point
- // elements in a and b, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
- FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
- {
- return vreinterpretq_m128d_s64(
- vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
- }
-
--// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
--//
--// r := a | b
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
-+// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
-+// and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
- FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
- vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
--// saturates.
--// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
-+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-+// using signed saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
- FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
-@@ -5196,19 +4678,9 @@ FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
- vqmovn_s16(vreinterpretq_s16_m128i(b))));
- }
-
--// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
--// and saturates.
--//
--// r0 := SignedSaturate(a0)
--// r1 := SignedSaturate(a1)
--// r2 := SignedSaturate(a2)
--// r3 := SignedSaturate(a3)
--// r4 := SignedSaturate(b0)
--// r5 := SignedSaturate(b1)
--// r6 := SignedSaturate(b2)
--// r7 := SignedSaturate(b3)
--//
--// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
-+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-+// using signed saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
- FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
-@@ -5216,19 +4688,9 @@ FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
- vqmovn_s32(vreinterpretq_s32_m128i(b))));
- }
-
--// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
--// integers and saturates.
--//
--// r0 := UnsignedSaturate(a0)
--// r1 := UnsignedSaturate(a1)
--// ...
--// r7 := UnsignedSaturate(a7)
--// r8 := UnsignedSaturate(b0)
--// r9 := UnsignedSaturate(b1)
--// ...
--// r15 := UnsignedSaturate(b7)
--//
--// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
-+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-+// using unsigned saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
- FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -5241,6 +4703,7 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
- // 'yield' instruction isn't a good fit because it's effectively a nop on most
- // Arm cores. Experience with several databases has shown has shown an 'isb' is
- // a reasonable approximation.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
- FORCE_INLINE void _mm_pause()
- {
- __asm__ __volatile__("isb\n");
-@@ -5250,15 +4713,15 @@ FORCE_INLINE void _mm_pause()
- // b, then horizontally sum each consecutive 8 differences to produce two
- // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
- // 16 bits of 64-bit elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
- FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
- {
- uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
- return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
- }
-
--// Sets the 8 signed 16-bit integer values.
--// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
-+// Set packed 16-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
- FORCE_INLINE __m128i _mm_set_epi16(short i7,
- short i6,
- short i5,
-@@ -5272,33 +4735,31 @@ FORCE_INLINE __m128i _mm_set_epi16(short i7,
- return vreinterpretq_m128i_s16(vld1q_s16(data));
- }
-
--// Sets the 4 signed 32-bit integer values.
--// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
-+// Set packed 32-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
- FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
- {
- int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
- return vreinterpretq_m128i_s32(vld1q_s32(data));
- }
-
--// Returns the __m128i structure with its two 64-bit integer values
--// initialized to the values of the two 64-bit integers passed in.
--// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-+// Set packed 64-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
- FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
- {
- return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
- }
-
--// Returns the __m128i structure with its two 64-bit integer values
--// initialized to the values of the two 64-bit integers passed in.
--// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-+// Set packed 64-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
- FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
- {
- return vreinterpretq_m128i_s64(
- vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
- }
-
--// Sets the 16 signed 8-bit integer values.
--// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
-+// Set packed 8-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
- FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
- signed char b14,
- signed char b13,
-@@ -5326,7 +4787,7 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
-
- // Set packed double-precision (64-bit) floating-point elements in dst with the
- // supplied values.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
- FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
- {
- double ALIGN_STRUCT(16) data[2] = {e0, e1};
-@@ -5339,12 +4800,12 @@ FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
-
- // Broadcast double-precision (64-bit) floating-point value a to all elements of
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
- #define _mm_set_pd1 _mm_set1_pd
-
- // Copy double-precision (64-bit) floating-point element a to the lower element
- // of dst, and zero the upper element.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
- FORCE_INLINE __m128d _mm_set_sd(double a)
- {
- #if defined(__aarch64__)
-@@ -5354,54 +4815,36 @@ FORCE_INLINE __m128d _mm_set_sd(double a)
- #endif
- }
-
--// Sets the 8 signed 16-bit integer values to w.
--//
--// r0 := w
--// r1 := w
--// ...
--// r7 := w
--//
--// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
-+// Broadcast 16-bit integer a to all all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
- FORCE_INLINE __m128i _mm_set1_epi16(short w)
- {
- return vreinterpretq_m128i_s16(vdupq_n_s16(w));
- }
-
--// Sets the 4 signed 32-bit integer values to i.
--//
--// r0 := i
--// r1 := i
--// r2 := i
--// r3 := I
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
-+// Broadcast 32-bit integer a to all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
- FORCE_INLINE __m128i _mm_set1_epi32(int _i)
- {
- return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
- }
-
--// Sets the 2 signed 64-bit integer values to i.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
-+// Broadcast 64-bit integer a to all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
- FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
- {
- return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
- }
-
--// Sets the 2 signed 64-bit integer values to i.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
-+// Broadcast 64-bit integer a to all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
- FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
- {
- return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
- }
-
--// Sets the 16 signed 8-bit integer values to b.
--//
--// r0 := b
--// r1 := b
--// ...
--// r15 := b
--//
--// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
-+// Broadcast 8-bit integer a to all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
- FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
- {
- return vreinterpretq_m128i_s8(vdupq_n_s8(w));
-@@ -5409,7 +4852,7 @@ FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
-
- // Broadcast double-precision (64-bit) floating-point value a to all elements of
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
- FORCE_INLINE __m128d _mm_set1_pd(double d)
- {
- #if defined(__aarch64__)
-@@ -5419,13 +4862,8 @@ FORCE_INLINE __m128d _mm_set1_pd(double d)
- #endif
- }
-
--// Sets the 8 signed 16-bit integer values in reverse order.
--//
--// Return Value
--// r0 := w0
--// r1 := w1
--// ...
--// r7 := w7
-+// Set packed 16-bit integers in dst with the supplied values in reverse order.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
- FORCE_INLINE __m128i _mm_setr_epi16(short w0,
- short w1,
- short w2,
-@@ -5439,8 +4877,8 @@ FORCE_INLINE __m128i _mm_setr_epi16(short w0,
- return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
- }
-
--// Sets the 4 signed 32-bit integer values in reverse order
--// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
-+// Set packed 32-bit integers in dst with the supplied values in reverse order.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
- FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
- {
- int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
-@@ -5448,14 +4886,14 @@ FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
- }
-
- // Set packed 64-bit integers in dst with the supplied values in reverse order.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
- FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
- {
- return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
- }
-
--// Sets the 16 signed 8-bit integer values in reverse order.
--// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
-+// Set packed 8-bit integers in dst with the supplied values in reverse order.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
- FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
- signed char b1,
- signed char b2,
-@@ -5483,14 +4921,14 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
-
- // Set packed double-precision (64-bit) floating-point elements in dst with the
- // supplied values in reverse order.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
- FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
- {
- return _mm_set_pd(e0, e1);
- }
-
- // Return vector of type __m128d with all elements set to zero.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
- FORCE_INLINE __m128d _mm_setzero_pd(void)
- {
- #if defined(__aarch64__)
-@@ -5500,15 +4938,16 @@ FORCE_INLINE __m128d _mm_setzero_pd(void)
- #endif
- }
-
--// Sets the 128-bit value to zero
--// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
-+// Return vector of type __m128i with all elements set to zero.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
- FORCE_INLINE __m128i _mm_setzero_si128(void)
- {
- return vreinterpretq_m128i_s32(vdupq_n_s32(0));
- }
-
--// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
--// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
-+// Shuffle 32-bit integers in a using the control in imm8, and store the results
-+// in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
- // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
- // __constrange(0,255) int imm)
- #ifdef _sse2neon_shuffle
-@@ -5577,11 +5016,7 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
-
- // Shuffle double-precision (64-bit) floating-point elements using the control
- // in imm8, and store the results in dst.
--//
--// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
--// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
- #ifdef _sse2neon_shuffle
- #define _mm_shuffle_pd(a, b, imm8) \
- vreinterpretq_m128d_s64( \
-@@ -5627,17 +5062,7 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
-
- // Shift packed 16-bit integers in a left by count while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF count[63:0] > 15
--// dst[i+15:i] := 0
--// ELSE
--// dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
- FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
- {
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5650,17 +5075,7 @@ FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
-
- // Shift packed 32-bit integers in a left by count while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// IF count[63:0] > 31
--// dst[i+31:i] := 0
--// ELSE
--// dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
- FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
- {
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5673,17 +5088,7 @@ FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
-
- // Shift packed 64-bit integers in a left by count while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// IF count[63:0] > 63
--// dst[i+63:i] := 0
--// ELSE
--// dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
- FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
- {
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5696,17 +5101,7 @@ FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
-
- // Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF imm8[7:0] > 15
--// dst[i+15:i] := 0
--// ELSE
--// dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
- FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
- {
- if (_sse2neon_unlikely(imm & ~15))
-@@ -5717,17 +5112,7 @@ FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
-
- // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// IF imm8[7:0] > 31
--// dst[i+31:i] := 0
--// ELSE
--// dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
- FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
- {
- if (_sse2neon_unlikely(imm & ~31))
-@@ -5738,17 +5123,7 @@ FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
-
- // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// IF imm8[7:0] > 63
--// dst[i+63:i] := 0
--// ELSE
--// dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
- FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
- {
- if (_sse2neon_unlikely(imm & ~63))
-@@ -5759,14 +5134,7 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
-
- // Shift a left by imm8 bytes while shifting in zeros, and store the results in
- // dst.
--//
--// tmp := imm8[7:0]
--// IF tmp > 15
--// tmp := 16
--// FI
--// dst[127:0] := a[127:0] << (tmp*8)
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
- #define _mm_slli_si128(a, imm) \
- __extension__({ \
- int8x16_t ret; \
-@@ -5782,7 +5150,7 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
-
- // Compute the square root of packed double-precision (64-bit) floating-point
- // elements in a, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
- FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -5797,7 +5165,7 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
- // Compute the square root of the lower double-precision (64-bit) floating-point
- // element in b, store the result in the lower element of dst, and copy the
- // upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
- FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -5809,17 +5177,7 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
-
- // Shift packed 16-bit integers in a right by count while shifting in sign bits,
- // and store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF count[63:0] > 15
--// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
--// ELSE
--// dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
- FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
- {
- int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-@@ -5830,17 +5188,7 @@ FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
-
- // Shift packed 32-bit integers in a right by count while shifting in sign bits,
- // and store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// IF count[63:0] > 31
--// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
--// ELSE
--// dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
- FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
- {
- int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-@@ -5851,17 +5199,7 @@ FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
-
- // Shift packed 16-bit integers in a right by imm8 while shifting in sign
- // bits, and store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF imm8[7:0] > 15
--// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
--// ELSE
--// dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
- FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
- {
- const int count = (imm & ~15) ? 15 : imm;
-@@ -5870,17 +5208,7 @@ FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
-
- // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
- // and store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// IF imm8[7:0] > 31
--// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
--// ELSE
--// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
- // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
- #define _mm_srai_epi32(a, imm) \
- __extension__({ \
-@@ -5899,17 +5227,7 @@ FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
-
- // Shift packed 16-bit integers in a right by count while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF count[63:0] > 15
--// dst[i+15:i] := 0
--// ELSE
--// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
- FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
- {
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5922,17 +5240,7 @@ FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
-
- // Shift packed 32-bit integers in a right by count while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// IF count[63:0] > 31
--// dst[i+31:i] := 0
--// ELSE
--// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
- FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
- {
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5945,17 +5253,7 @@ FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
-
- // Shift packed 64-bit integers in a right by count while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// IF count[63:0] > 63
--// dst[i+63:i] := 0
--// ELSE
--// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
- FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
- {
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5968,17 +5266,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-
- // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF imm8[7:0] > 15
--// dst[i+15:i] := 0
--// ELSE
--// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
- #define _mm_srli_epi16(a, imm) \
- __extension__({ \
- __m128i ret; \
-@@ -5993,17 +5281,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-
- // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// IF imm8[7:0] > 31
--// dst[i+31:i] := 0
--// ELSE
--// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
- // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
- #define _mm_srli_epi32(a, imm) \
- __extension__({ \
-@@ -6019,17 +5297,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-
- // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// IF imm8[7:0] > 63
--// dst[i+63:i] := 0
--// ELSE
--// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
- #define _mm_srli_epi64(a, imm) \
- __extension__({ \
- __m128i ret; \
-@@ -6044,14 +5312,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-
- // Shift a right by imm8 bytes while shifting in zeros, and store the results in
- // dst.
--//
--// tmp := imm8[7:0]
--// IF tmp > 15
--// tmp := 16
--// FI
--// dst[127:0] := a[127:0] >> (tmp*8)
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
- #define _mm_srli_si128(a, imm) \
- __extension__({ \
- int8x16_t ret; \
-@@ -6066,7 +5327,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
- // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
- // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
- // or a general-protection exception may be generated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
- FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6079,7 +5340,7 @@ FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
- // Store the lower double-precision (64-bit) floating-point element from a into
- // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
- // boundary or a general-protection exception may be generated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
- FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6095,7 +5356,7 @@ FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
-
- // Store the lower double-precision (64-bit) floating-point element from a into
- // memory. mem_addr does not need to be aligned on any particular boundary.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
- FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6105,8 +5366,9 @@ FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
- #endif
- }
-
--// Stores four 32-bit integer values as (as a __m128i value) at the address p.
--// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-+// Store 128-bits of integer data from a into memory. mem_addr must be aligned
-+// on a 16-byte boundary or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
- FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
- {
- vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-@@ -6115,15 +5377,12 @@ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
- // Store the lower double-precision (64-bit) floating-point element from a into
- // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
- // boundary or a general-protection exception may be generated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
- #define _mm_store1_pd _mm_store_pd1
-
- // Store the upper double-precision (64-bit) floating-point element from a into
- // memory.
--//
--// MEM[mem_addr+63:mem_addr] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
- FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6133,8 +5392,8 @@ FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
- #endif
- }
-
--// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
--// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
-+// Store 64-bit integer from the first element of a into memory.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
- FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
- {
- vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
-@@ -6142,10 +5401,7 @@ FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
-
- // Store the lower double-precision (64-bit) floating-point element from a into
- // memory.
--//
--// MEM[mem_addr+63:mem_addr] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
- FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6158,11 +5414,7 @@ FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
- // Store 2 double-precision (64-bit) floating-point elements from a into memory
- // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--// MEM[mem_addr+63:mem_addr] := a[127:64]
--// MEM[mem_addr+127:mem_addr+64] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
- FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
- {
- float32x4_t f = vreinterpretq_f32_m128d(a);
-@@ -6172,21 +5424,23 @@ FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
- // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
- // elements) from a into memory. mem_addr does not need to be aligned on any
- // particular boundary.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
- FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
- {
- _mm_store_pd(mem_addr, a);
- }
-
--// Stores 128-bits of integer data a at the address p.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
-+// Store 128-bits of integer data from a into memory. mem_addr does not need to
-+// be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
- FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
- {
- vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
- }
-
--// Stores 32-bits of integer data a at the address p.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
-+// Store 32-bit integer from the first element of a into memory. mem_addr does
-+// not need to be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
- FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
- {
- vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
-@@ -6196,7 +5450,7 @@ FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
- // elements) from a into memory using a non-temporal memory hint. mem_addr must
- // be aligned on a 16-byte boundary or a general-protection exception may be
- // generated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
- FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
- {
- #if __has_builtin(__builtin_nontemporal_store)
-@@ -6208,10 +5462,10 @@ FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
- #endif
- }
-
--// Stores the data in a to the address p without polluting the caches. If the
--// cache line containing address p is already in the cache, the cache will be
--// updated.
--// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
-+// Store 128-bits of integer data from a into memory using a non-temporal memory
-+// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
-+// exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
- FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
- {
- #if __has_builtin(__builtin_nontemporal_store)
-@@ -6224,7 +5478,7 @@ FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
- // Store 32-bit integer a into memory using a non-temporal hint to minimize
- // cache pollution. If the cache line containing address mem_addr is already in
- // the cache, the cache will be updated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
- FORCE_INLINE void _mm_stream_si32(int *p, int a)
- {
- vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
-@@ -6233,7 +5487,7 @@ FORCE_INLINE void _mm_stream_si32(int *p, int a)
- // Store 64-bit integer a into memory using a non-temporal hint to minimize
- // cache pollution. If the cache line containing address mem_addr is already in
- // the cache, the cache will be updated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
- FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
- {
- vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
-@@ -6241,32 +5495,25 @@ FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
-
- // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
- // store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
- FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
- vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
--// unsigned 32-bit integers of a.
--//
--// r0 := a0 - b0
--// r1 := a1 - b1
--// r2 := a2 - b2
--// r3 := a3 - b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
-+// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
- FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
- vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
--// and store the results in dst.
--// r0 := a0 - b0
--// r1 := a1 - b1
-+// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
- FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s64(
-@@ -6275,7 +5522,7 @@ FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
-
- // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
- // store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
- FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
-@@ -6285,13 +5532,7 @@ FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
- // Subtract packed double-precision (64-bit) floating-point elements in b from
- // packed double-precision (64-bit) floating-point elements in a, and store the
- // results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// dst[i+63:i] := a[i+63:i] - b[i+63:i]
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
- FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -6311,71 +5552,50 @@ FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
- // the lower double-precision (64-bit) floating-point element in a, store the
- // result in the lower element of dst, and copy the upper element from a to the
- // upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
- FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_sub_pd(a, b));
- }
-
- // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
--//
--// dst[63:0] := a[63:0] - b[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
- FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
- {
- return vreinterpret_m64_s64(
- vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
- }
-
--// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
--// of a and saturates.
--//
--// r0 := SignedSaturate(a0 - b0)
--// r1 := SignedSaturate(a1 - b1)
--// ...
--// r7 := SignedSaturate(a7 - b7)
--//
--// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
-+// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
-+// using saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
- FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
- vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
--// of a and saturates.
--//
--// r0 := SignedSaturate(a0 - b0)
--// r1 := SignedSaturate(a1 - b1)
--// ...
--// r15 := SignedSaturate(a15 - b15)
--//
--// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
-+// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
-+// using saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
- FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
- vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
- }
-
--// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit
--// integers of a and saturates..
--// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
-+// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
-+// integers in a using saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
- FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
- vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
- }
-
--// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
--// integers of a and saturates.
--//
--// r0 := UnsignedSaturate(a0 - b0)
--// r1 := UnsignedSaturate(a1 - b1)
--// ...
--// r15 := UnsignedSaturate(a15 - b15)
--//
--// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
-+// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
-+// integers in a using saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
- FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -6390,7 +5610,7 @@ FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
- #define _mm_ucomineq_sd _mm_comineq_sd
-
- // Return vector of type __m128d with undefined elements.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
- FORCE_INLINE __m128d _mm_undefined_pd(void)
- {
- #if defined(__GNUC__) || defined(__clang__)
-@@ -6404,19 +5624,9 @@ FORCE_INLINE __m128d _mm_undefined_pd(void)
- #endif
- }
-
--// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
--// upper 4 signed or unsigned 16-bit integers in b.
--//
--// r0 := a4
--// r1 := b4
--// r2 := a5
--// r3 := b5
--// r4 := a6
--// r5 := b6
--// r6 := a7
--// r7 := b7
--//
--// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
-+// Unpack and interleave 16-bit integers from the high half of a and b, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
- FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6430,9 +5640,9 @@ FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
- #endif
- }
-
--// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
--// upper 2 signed or unsigned 32-bit integers in b.
--// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
-+// Unpack and interleave 32-bit integers from the high half of a and b, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
- FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6446,30 +5656,24 @@ FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
- #endif
- }
-
--// Interleaves the upper signed or unsigned 64-bit integer in a with the
--// upper signed or unsigned 64-bit integer in b.
--//
--// r0 := a1
--// r1 := b1
-+// Unpack and interleave 64-bit integers from the high half of a and b, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
- FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
- {
-+#if defined(__aarch64__)
-+ return vreinterpretq_m128i_s64(
-+ vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-+#else
- int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
- int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
- return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
-+#endif
- }
-
--// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
--// 8 signed or unsigned 8-bit integers in b.
--//
--// r0 := a8
--// r1 := b8
--// r2 := a9
--// r3 := b9
--// ...
--// r14 := a15
--// r15 := b15
--//
--// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
-+// Unpack and interleave 8-bit integers from the high half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
- FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6487,15 +5691,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
-
- // Unpack and interleave double-precision (64-bit) floating-point elements from
- // the high half of a and b, and store the results in dst.
--//
--// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
--// dst[63:0] := src1[127:64]
--// dst[127:64] := src2[127:64]
--// RETURN dst[127:0]
--// }
--// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
- FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -6508,19 +5704,9 @@ FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
- #endif
- }
-
--// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
--// lower 4 signed or unsigned 16-bit integers in b.
--//
--// r0 := a0
--// r1 := b0
--// r2 := a1
--// r3 := b1
--// r4 := a2
--// r5 := b2
--// r6 := a3
--// r7 := b3
--//
--// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
-+// Unpack and interleave 16-bit integers from the low half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
- FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6534,15 +5720,9 @@ FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
- #endif
- }
-
--// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
--// lower 2 signed or unsigned 32 - bit integers in b.
--//
--// r0 := a0
--// r1 := b0
--// r2 := a1
--// r3 := b1
--//
--// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
-+// Unpack and interleave 32-bit integers from the low half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
- FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6556,25 +5736,24 @@ FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
- #endif
- }
-
-+// Unpack and interleave 64-bit integers from the low half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
- FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
- {
-+#if defined(__aarch64__)
-+ return vreinterpretq_m128i_s64(
-+ vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-+#else
- int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
- int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
- return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
-+#endif
- }
-
--// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
--// 8 signed or unsigned 8-bit integers in b.
--//
--// r0 := a0
--// r1 := b0
--// r2 := a1
--// r3 := b1
--// ...
--// r14 := a7
--// r15 := b7
--//
--// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
-+// Unpack and interleave 8-bit integers from the low half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
- FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6590,15 +5769,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
-
- // Unpack and interleave double-precision (64-bit) floating-point elements from
- // the low half of a and b, and store the results in dst.
--//
--// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
--// dst[63:0] := src1[63:0]
--// dst[127:64] := src2[63:0]
--// RETURN dst[127:0]
--// }
--// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
- FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -6613,21 +5784,16 @@ FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
-
- // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
- // elements in a and b, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
- FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
- {
- return vreinterpretq_m128d_s64(
- veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
- }
-
--// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
--// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
-+// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
-+// and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
- FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
-@@ -6639,17 +5805,7 @@ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
- // Alternatively add and subtract packed double-precision (64-bit)
- // floating-point elements in a to/from packed elements in b, and store the
- // results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// IF ((j & 1) == 0)
--// dst[i+63:i] := a[i+63:i] - b[i+63:i]
--// ELSE
--// dst[i+63:i] := a[i+63:i] + b[i+63:i]
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
- FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
- {
- _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
-@@ -6665,7 +5821,7 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
- // Alternatively add and subtract packed single-precision (32-bit)
- // floating-point elements in a to/from packed elements in b, and store the
- // results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
- FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
- {
- _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
-@@ -6680,7 +5836,7 @@ FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
-
- // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
- // elements in a and b, and pack the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
- FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -6694,9 +5850,9 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
- #endif
- }
-
--// Computes pairwise add of each argument as single-precision, floating-point
--// values a and b.
--// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
-+// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
-+// elements in a and b, and pack the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
- FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
- {
- #if defined(__aarch64__)
-@@ -6714,7 +5870,7 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
-
- // Horizontally subtract adjacent pairs of double-precision (64-bit)
- // floating-point elements in a and b, and pack the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
- FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
- {
- #if defined(__aarch64__)
-@@ -6732,7 +5888,7 @@ FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
-
- // Horizontally subtract adjacent pairs of single-precision (32-bit)
- // floating-point elements in a and b, and pack the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
- FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
- {
- float32x4_t a = vreinterpretq_f32_m128(_a);
-@@ -6749,24 +5905,17 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
- // Load 128-bits of integer data from unaligned memory into dst. This intrinsic
- // may perform better than _mm_loadu_si128 when the data crosses a cache line
- // boundary.
--//
--// dst[127:0] := MEM[mem_addr+127:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
- #define _mm_lddqu_si128 _mm_loadu_si128
-
- // Load a double-precision (64-bit) floating-point element from memory into both
- // elements of dst.
--//
--// dst[63:0] := MEM[mem_addr+63:mem_addr]
--// dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
- #define _mm_loaddup_pd _mm_load1_pd
-
- // Duplicate the low double-precision (64-bit) floating-point element from a,
- // and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
- FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -6780,7 +5929,7 @@ FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
-
- // Duplicate odd-indexed single-precision (32-bit) floating-point elements
- // from a, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
- FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
- {
- #if defined(__aarch64__)
-@@ -6799,7 +5948,7 @@ FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
-
- // Duplicate even-indexed single-precision (32-bit) floating-point elements
- // from a, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
- FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
- {
- #if defined(__aarch64__)
-@@ -6820,13 +5969,7 @@ FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
-
- // Compute the absolute value of packed signed 16-bit integers in a, and store
- // the unsigned results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// dst[i+15:i] := ABS(a[i+15:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
- FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
- {
- return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
-@@ -6834,13 +5977,7 @@ FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
-
- // Compute the absolute value of packed signed 32-bit integers in a, and store
- // the unsigned results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// dst[i+31:i] := ABS(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
- FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
- {
- return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
-@@ -6848,13 +5985,7 @@ FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
-
- // Compute the absolute value of packed signed 8-bit integers in a, and store
- // the unsigned results in dst.
--//
--// FOR j := 0 to 15
--// i := j*8
--// dst[i+7:i] := ABS(a[i+7:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
- FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
- {
- return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
-@@ -6862,13 +5993,7 @@ FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
-
- // Compute the absolute value of packed signed 16-bit integers in a, and store
- // the unsigned results in dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// dst[i+15:i] := ABS(a[i+15:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
- FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
- {
- return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
-@@ -6876,13 +6001,7 @@ FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
-
- // Compute the absolute value of packed signed 32-bit integers in a, and store
- // the unsigned results in dst.
--//
--// FOR j := 0 to 1
--// i := j*32
--// dst[i+31:i] := ABS(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
- FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
- {
- return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
-@@ -6890,13 +6009,7 @@ FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
-
- // Compute the absolute value of packed signed 8-bit integers in a, and store
- // the unsigned results in dst.
--//
--// FOR j := 0 to 7
--// i := j*8
--// dst[i+7:i] := ABS(a[i+7:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
- FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
- {
- return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
-@@ -6904,11 +6017,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
-
- // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
- // the result right by imm8 bytes, and store the low 16 bytes in dst.
--//
--// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
--// dst[127:0] := tmp[127:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
- #define _mm_alignr_epi8(a, b, imm) \
- __extension__({ \
- uint8x16_t _a = vreinterpretq_u8_m128i(a); \
-@@ -6926,11 +6035,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
-
- // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
- // the result right by imm8 bytes, and store the low 8 bytes in dst.
--//
--// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
--// dst[63:0] := tmp[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
- #define _mm_alignr_pi8(a, b, imm) \
- __extension__({ \
- __m64 ret; \
-@@ -6953,8 +6058,9 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
- ret; \
- })
-
--// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
--// values a and b.
-+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-+// signed 16-bit results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
- FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
- {
- int16x8_t a = vreinterpretq_s16_m128i(_a);
-@@ -6968,8 +6074,9 @@ FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
- #endif
- }
-
--// Computes pairwise add of each argument as a 32-bit signed or unsigned integer
--// values a and b.
-+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-+// signed 32-bit results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
- FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
- {
- int32x4_t a = vreinterpretq_s32_m128i(_a);
-@@ -6985,7 +6092,7 @@ FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
-
- // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
- // signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
- FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
- {
- return vreinterpret_m64_s16(
-@@ -6994,15 +6101,16 @@ FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
-
- // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
- // signed 32-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
- FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
- {
- return vreinterpret_m64_s32(
- vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
- }
-
--// Computes saturated pairwise sub of each argument as a 16-bit signed
--// integer values a and b.
-+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-+// saturation, and pack the signed 16-bit results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
- FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
- {
- #if defined(__aarch64__)
-@@ -7025,7 +6133,7 @@ FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
-
- // Horizontally add adjacent pairs of signed 16-bit integers in a and b using
- // saturation, and pack the signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
- FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
- {
- int16x4_t a = vreinterpret_s16_m64(_a);
-@@ -7040,7 +6148,7 @@ FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
-
- // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
- // the signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
- FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
- {
- int16x8_t a = vreinterpretq_s16_m128i(_a);
-@@ -7056,7 +6164,7 @@ FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
-
- // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
- // the signed 32-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
- FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
- {
- int32x4_t a = vreinterpretq_s32_m128i(_a);
-@@ -7072,7 +6180,7 @@ FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
-
- // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
- // the signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
- FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
- {
- int16x4_t a = vreinterpret_s16_m64(_a);
-@@ -7087,7 +6195,7 @@ FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
-
- // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
- // the signed 32-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
- FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
- {
- int32x2_t a = vreinterpret_s32_m64(_a);
-@@ -7100,9 +6208,9 @@ FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
- #endif
- }
-
--// Computes saturated pairwise difference of each argument as a 16-bit signed
--// integer values a and b.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
-+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-+// using saturation, and pack the signed 16-bit results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
- FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
- {
- int16x8_t a = vreinterpretq_s16_m128i(_a);
-@@ -7118,7 +6226,7 @@ FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
-
- // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
- // using saturation, and pack the signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
- FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
- {
- int16x4_t a = vreinterpret_s16_m64(_a);
-@@ -7135,12 +6243,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
- // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
- // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
- // and pack the saturated results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
--// a[i+7:i]*b[i+7:i] )
--// ENDFOR
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
- FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
- {
- #if defined(__aarch64__)
-@@ -7179,7 +6282,7 @@ FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
- // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
- // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
- // pack the saturated results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
- FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
- {
- uint16x4_t a = vreinterpret_u16_m64(_a);
-@@ -7204,12 +6307,7 @@ FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
- // Multiply packed signed 16-bit integers in a and b, producing intermediate
- // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
- // the packed 16-bit integers in dst.
--//
--// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
--// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
--// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
--// ...
--// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
- FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
- {
- // Has issues due to saturation
-@@ -7233,7 +6331,7 @@ FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
- // Multiply packed signed 16-bit integers in a and b, producing intermediate
- // signed 32-bit integers. Truncate each intermediate integer to the 18 most
- // significant bits, round by adding 1, and store bits [16:1] to dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
- FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
- {
- int32x4_t mul_extend =
-@@ -7245,7 +6343,7 @@ FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
-
- // Shuffle packed 8-bit integers in a according to shuffle control mask in the
- // corresponding 8-bit element of b, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
- FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
- {
- int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
-@@ -7275,18 +6373,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
-
- // Shuffle packed 8-bit integers in a according to shuffle control mask in the
- // corresponding 8-bit element of b, and store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*8
--// IF b[i+7] == 1
--// dst[i+7:i] := 0
--// ELSE
--// index[2:0] := b[i+2:i]
--// dst[i+7:i] := a[index*8+7:index*8]
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
- FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
- {
- const int8x8_t controlMask =
-@@ -7299,16 +6386,7 @@ FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
- // 16-bit integer in b is negative, and store the results in dst.
- // Element in dst are zeroed out when the corresponding element
- // in b is zero.
--//
--// for i in 0..7
--// if b[i] < 0
--// r[i] := -a[i]
--// else if b[i] == 0
--// r[i] := 0
--// else
--// r[i] := a[i]
--// fi
--// done
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
- FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
- {
- int16x8_t a = vreinterpretq_s16_m128i(_a);
-@@ -7336,16 +6414,7 @@ FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
- // 32-bit integer in b is negative, and store the results in dst.
- // Element in dst are zeroed out when the corresponding element
- // in b is zero.
--//
--// for i in 0..3
--// if b[i] < 0
--// r[i] := -a[i]
--// else if b[i] == 0
--// r[i] := 0
--// else
--// r[i] := a[i]
--// fi
--// done
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
- FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
- {
- int32x4_t a = vreinterpretq_s32_m128i(_a);
-@@ -7374,16 +6443,7 @@ FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
- // 8-bit integer in b is negative, and store the results in dst.
- // Element in dst are zeroed out when the corresponding element
- // in b is zero.
--//
--// for i in 0..15
--// if b[i] < 0
--// r[i] := -a[i]
--// else if b[i] == 0
--// r[i] := 0
--// else
--// r[i] := a[i]
--// fi
--// done
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
- FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
- {
- int8x16_t a = vreinterpretq_s8_m128i(_a);
-@@ -7412,19 +6472,7 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
- // Negate packed 16-bit integers in a when the corresponding signed 16-bit
- // integer in b is negative, and store the results in dst. Element in dst are
- // zeroed out when the corresponding element in b is zero.
--//
--// FOR j := 0 to 3
--// i := j*16
--// IF b[i+15:i] < 0
--// dst[i+15:i] := -(a[i+15:i])
--// ELSE IF b[i+15:i] == 0
--// dst[i+15:i] := 0
--// ELSE
--// dst[i+15:i] := a[i+15:i]
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
- FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
- {
- int16x4_t a = vreinterpret_s16_m64(_a);
-@@ -7453,19 +6501,7 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
- // Negate packed 32-bit integers in a when the corresponding signed 32-bit
- // integer in b is negative, and store the results in dst. Element in dst are
- // zeroed out when the corresponding element in b is zero.
--//
--// FOR j := 0 to 1
--// i := j*32
--// IF b[i+31:i] < 0
--// dst[i+31:i] := -(a[i+31:i])
--// ELSE IF b[i+31:i] == 0
--// dst[i+31:i] := 0
--// ELSE
--// dst[i+31:i] := a[i+31:i]
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
- FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
- {
- int32x2_t a = vreinterpret_s32_m64(_a);
-@@ -7494,19 +6530,7 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
- // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
- // in b is negative, and store the results in dst. Element in dst are zeroed out
- // when the corresponding element in b is zero.
--//
--// FOR j := 0 to 7
--// i := j*8
--// IF b[i+7:i] < 0
--// dst[i+7:i] := -(a[i+7:i])
--// ELSE IF b[i+7:i] == 0
--// dst[i+7:i] := 0
--// ELSE
--// dst[i+7:i] := a[i+7:i]
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
- FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
- {
- int8x8_t a = vreinterpret_s8_m64(_a);
-@@ -7536,15 +6560,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
-
- // Blend packed 16-bit integers from a and b using control mask imm8, and store
- // the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF imm8[j]
--// dst[i+15:i] := b[i+15:i]
--// ELSE
--// dst[i+15:i] := a[i+15:i]
--// FI
--// ENDFOR
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
- // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
- // __constrange(0,255) int imm)
- #define _mm_blend_epi16(a, b, imm) \
-@@ -7565,7 +6581,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
-
- // Blend packed double-precision (64-bit) floating-point elements from a and b
- // using control mask imm8, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
- #define _mm_blend_pd(a, b, imm) \
- __extension__({ \
- const uint64_t _mask[2] = { \
-@@ -7579,7 +6595,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
-
- // Blend packed single-precision (32-bit) floating-point elements from a and b
- // using mask, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
- FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
- {
- const uint32_t ALIGN_STRUCT(16)
-@@ -7595,15 +6611,7 @@ FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
-
- // Blend packed 8-bit integers from a and b using mask, and store the results in
- // dst.
--//
--// FOR j := 0 to 15
--// i := j*8
--// IF mask[i+7]
--// dst[i+7:i] := b[i+7:i]
--// ELSE
--// dst[i+7:i] := a[i+7:i]
--// FI
--// ENDFOR
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
- FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
- {
- // Use a signed shift right to create a mask with the sign bit
-@@ -7616,7 +6624,7 @@ FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
-
- // Blend packed double-precision (64-bit) floating-point elements from a and b
- // using mask, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
- FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
- {
- uint64x2_t mask =
-@@ -7634,7 +6642,7 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
-
- // Blend packed single-precision (32-bit) floating-point elements from a and b
- // using mask, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
- FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
- {
- // Use a signed shift right to create a mask with the sign bit
-@@ -7648,7 +6656,7 @@ FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
- // Round the packed double-precision (64-bit) floating-point elements in a up
- // to an integer value, and store the results as packed double-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
- FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -7662,7 +6670,7 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
- // Round the packed single-precision (32-bit) floating-point elements in a up to
- // an integer value, and store the results as packed single-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
- FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -7677,7 +6685,7 @@ FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
- // an integer value, store the result as a double-precision floating-point
- // element in the lower element of dst, and copy the upper element from a to the
- // upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
- FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_ceil_pd(b));
-@@ -7687,11 +6695,7 @@ FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
- // an integer value, store the result as a single-precision floating-point
- // element in the lower element of dst, and copy the upper 3 packed elements
- // from a to the upper elements of dst.
--//
--// dst[31:0] := CEIL(b[31:0])
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
- FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_ceil_ps(b));
-@@ -7714,16 +6718,18 @@ FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
- #endif
- }
-
--// Converts the four signed 16-bit integers in the lower 64 bits to four signed
--// 32-bit integers.
-+// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
- FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
- {
- return vreinterpretq_m128i_s32(
- vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
- }
-
--// Converts the two signed 16-bit integers in the lower 32 bits two signed
--// 32-bit integers.
-+// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
- FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
- {
- int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
-@@ -7732,16 +6738,18 @@ FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
- return vreinterpretq_m128i_s64(s64x2);
- }
-
--// Converts the two signed 32-bit integers in the lower 64 bits to two signed
--// 64-bit integers.
-+// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
- FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
- {
- return vreinterpretq_m128i_s64(
- vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
- }
-
--// Converts the four unsigned 8-bit integers in the lower 16 bits to four
--// unsigned 32-bit integers.
-+// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
- FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
- {
- int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
-@@ -7749,8 +6757,9 @@ FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
- return vreinterpretq_m128i_s16(s16x8);
- }
-
--// Converts the four unsigned 8-bit integers in the lower 32 bits to four
--// unsigned 32-bit integers.
-+// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
- FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
- {
- int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
-@@ -7759,8 +6768,9 @@ FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
- return vreinterpretq_m128i_s32(s32x4);
- }
-
--// Converts the two signed 8-bit integers in the lower 32 bits to four
--// signed 64-bit integers.
-+// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
-+// integers, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
- FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
- {
- int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
-@@ -7770,16 +6780,18 @@ FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
- return vreinterpretq_m128i_s64(s64x2);
- }
-
--// Converts the four unsigned 16-bit integers in the lower 64 bits to four
--// unsigned 32-bit integers.
-+// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
- FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
- {
- return vreinterpretq_m128i_u32(
- vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
- }
-
--// Converts the two unsigned 16-bit integers in the lower 32 bits to two
--// unsigned 64-bit integers.
-+// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
- FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
- {
- uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
-@@ -7788,8 +6800,9 @@ FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
- return vreinterpretq_m128i_u64(u64x2);
- }
-
--// Converts the two unsigned 32-bit integers in the lower 64 bits to two
--// unsigned 64-bit integers.
-+// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
- FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
- {
- return vreinterpretq_m128i_u64(
-@@ -7798,7 +6811,7 @@ FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
-
- // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
- // and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
- FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
- {
- uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
-@@ -7806,9 +6819,9 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
- return vreinterpretq_m128i_u16(u16x8);
- }
-
--// Converts the four unsigned 8-bit integers in the lower 32 bits to four
--// unsigned 32-bit integers.
--// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
-+// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
- FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
- {
- uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
-@@ -7817,8 +6830,9 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
- return vreinterpretq_m128i_u32(u32x4);
- }
-
--// Converts the two unsigned 8-bit integers in the lower 16 bits to two
--// unsigned 64-bit integers.
-+// Zero extend packed unsigned 8-bit integers in the low 8 byte sof a to packed
-+// 64-bit integers, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
- FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
- {
- uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
-@@ -7831,7 +6845,7 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
- // Conditionally multiply the packed double-precision (64-bit) floating-point
- // elements in a and b using the high 4 bits in imm8, sum the four products, and
- // conditionally store the sum in dst using the low 4 bits of imm8.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
- FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
- {
- // Generate mask value from constant immediate bit value
-@@ -7877,7 +6891,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
- // Conditionally multiply the packed single-precision (32-bit) floating-point
- // elements in a and b using the high 4 bits in imm8, sum the four products,
- // and conditionally store the sum in dst using the low 4 bits of imm.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
- FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
- {
- #if defined(__aarch64__)
-@@ -7918,22 +6932,24 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
- return vreinterpretq_m128_f32(res);
- }
-
--// Extracts the selected signed or unsigned 32-bit integer from a and zero
--// extends.
-+// Extract a 32-bit integer from a, selected with imm8, and store the result in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
- // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
- #define _mm_extract_epi32(a, imm) \
- vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
-
--// Extracts the selected signed or unsigned 64-bit integer from a and zero
--// extends.
-+// Extract a 64-bit integer from a, selected with imm8, and store the result in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
- // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
- #define _mm_extract_epi64(a, imm) \
- vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
-
--// Extracts the selected signed or unsigned 8-bit integer from a and zero
--// extends.
--// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
-+// Extract an 8-bit integer from a, selected with imm8, and store the result in
-+// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
-+// __constrange(0,16) int imm)
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
- #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
-
- // Extracts the selected single-precision (32-bit) floating-point from a.
-@@ -7943,7 +6959,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
- // Round the packed double-precision (64-bit) floating-point elements in a down
- // to an integer value, and store the results as packed double-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
- FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -7957,7 +6973,7 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
- // Round the packed single-precision (32-bit) floating-point elements in a down
- // to an integer value, and store the results as packed single-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
- FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -7972,7 +6988,7 @@ FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
- // an integer value, store the result as a double-precision floating-point
- // element in the lower element of dst, and copy the upper element from a to the
- // upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
- FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_floor_pd(b));
-@@ -7982,18 +6998,15 @@ FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
- // an integer value, store the result as a single-precision floating-point
- // element in the lower element of dst, and copy the upper 3 packed elements
- // from a to the upper elements of dst.
--//
--// dst[31:0] := FLOOR(b[31:0])
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
- FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_floor_ps(b));
- }
-
--// Inserts the least significant 32 bits of b into the selected 32-bit integer
--// of a.
-+// Copy a to dst, and insert the 32-bit integer i into dst at the location
-+// specified by imm8.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
- // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
- // __constrange(0,4) int imm)
- #define _mm_insert_epi32(a, b, imm) \
-@@ -8002,8 +7015,9 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
- vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
- })
-
--// Inserts the least significant 64 bits of b into the selected 64-bit integer
--// of a.
-+// Copy a to dst, and insert the 64-bit integer i into dst at the location
-+// specified by imm8.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
- // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
- // __constrange(0,2) int imm)
- #define _mm_insert_epi64(a, b, imm) \
-@@ -8012,8 +7026,9 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
- vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
- })
-
--// Inserts the least significant 8 bits of b into the selected 8-bit integer
--// of a.
-+// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
-+// location specified by imm8.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
- // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
- // __constrange(0,16) int imm)
- #define _mm_insert_epi8(a, b, imm) \
-@@ -8025,7 +7040,7 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
- // Copy a to tmp, then insert a single-precision (32-bit) floating-point
- // element from b into tmp using the control in imm8. Store tmp to dst using
- // the mask in imm8 (elements are zeroed out when the corresponding bit is set).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
- #define _mm_insert_ps(a, b, imm8) \
- __extension__({ \
- float32x4_t tmp1 = \
-@@ -8045,17 +7060,9 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
- vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \
- })
-
--// epi versions of min/max
--// Computes the pariwise maximums of the four signed 32-bit integer values of a
--// and b.
--//
--// A 128-bit parameter that can be defined with the following equations:
--// r0 := (a0 > b0) ? a0 : b0
--// r1 := (a1 > b1) ? a1 : b1
--// r2 := (a2 > b2) ? a2 : b2
--// r3 := (a3 > b3) ? a3 : b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
-+// Compare packed signed 32-bit integers in a and b, and store packed maximum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
- FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
-@@ -8064,7 +7071,7 @@ FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
-
- // Compare packed signed 8-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
- FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
-@@ -8073,7 +7080,7 @@ FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
-
- // Compare packed unsigned 16-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
- FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
-@@ -8082,23 +7089,16 @@ FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
-
- // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
- FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u32(
- vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
- }
-
--// Computes the pariwise minima of the four signed 32-bit integer values of a
--// and b.
--//
--// A 128-bit parameter that can be defined with the following equations:
--// r0 := (a0 < b0) ? a0 : b0
--// r1 := (a1 < b1) ? a1 : b1
--// r2 := (a2 < b2) ? a2 : b2
--// r3 := (a3 < b3) ? a3 : b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
-+// Compare packed signed 32-bit integers in a and b, and store packed minimum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
- FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
-@@ -8107,7 +7107,7 @@ FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
-
- // Compare packed signed 8-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
- FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
-@@ -8116,7 +7116,7 @@ FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
-
- // Compare packed unsigned 16-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
- FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
-@@ -8125,7 +7125,7 @@ FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
-
- // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
- FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u32(
-@@ -8134,21 +7134,7 @@ FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
-
- // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
- // in a, store the minimum and index in dst, and zero the remaining bits in dst.
--//
--// index[2:0] := 0
--// min[15:0] := a[15:0]
--// FOR j := 0 to 7
--// i := j*16
--// IF a[i+15:i] < min[15:0]
--// index[2:0] := j
--// min[15:0] := a[i+15:i]
--// FI
--// ENDFOR
--// dst[15:0] := min[15:0]
--// dst[18:16] := index[2:0]
--// dst[127:19] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
- FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
- {
- __m128i dst;
-@@ -8198,7 +7184,7 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
- // quadruplets from a. One quadruplet is selected from b starting at on the
- // offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
- // integers selected from a starting at the offset specified in imm8.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
- FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
- {
- uint8x16_t _a, _b;
-@@ -8278,9 +7264,7 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
-
- // Multiply the low signed 32-bit integers from each packed 64-bit element in
- // a and b, and store the signed 64-bit results in dst.
--//
--// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
--// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
- FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
- {
- // vmull_s32 upcasts instead of masking, so we downcast.
-@@ -8289,26 +7273,18 @@ FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
- return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
- }
-
--// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
--// unsigned 32-bit integers from b.
--// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
-+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
-+// integers, and store the low 32 bits of the intermediate integers in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
- FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
- vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
--// integers and saturates.
--//
--// r0 := UnsignedSaturate(a0)
--// r1 := UnsignedSaturate(a1)
--// r2 := UnsignedSaturate(a2)
--// r3 := UnsignedSaturate(a3)
--// r4 := UnsignedSaturate(b0)
--// r5 := UnsignedSaturate(b1)
--// r6 := UnsignedSaturate(b2)
--// r7 := UnsignedSaturate(b3)
-+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-+// using unsigned saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
- FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
-@@ -8319,7 +7295,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
- // Round the packed double-precision (64-bit) floating-point elements in a using
- // the rounding parameter, and store the results as packed double-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
- FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
- {
- #if defined(__aarch64__)
-@@ -8448,7 +7424,7 @@ FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
- // the rounding parameter, store the result as a double-precision floating-point
- // element in the lower element of dst, and copy the upper element from a to the
- // upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
- FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
- {
- return _mm_move_sd(a, _mm_round_pd(b, rounding));
-@@ -8468,7 +7444,7 @@ FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
- // (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress
- // exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
- // _MM_SET_ROUNDING_MODE
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
- FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
- {
- return _mm_move_ss(a, _mm_round_ps(b, rounding));
-@@ -8477,10 +7453,7 @@ FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
- // Load 128-bits of integer data from memory into dst using a non-temporal
- // memory hint. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--// dst[127:0] := MEM[mem_addr+127:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
- FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
- {
- #if __has_builtin(__builtin_nontemporal_store)
-@@ -8492,7 +7465,7 @@ FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
-
- // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
- // all 1's, and return 1 if the result is zero, otherwise return 0.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
- FORCE_INLINE int _mm_test_all_ones(__m128i a)
- {
- return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
-@@ -8501,7 +7474,7 @@ FORCE_INLINE int _mm_test_all_ones(__m128i a)
-
- // Compute the bitwise AND of 128 bits (representing integer data) in a and
- // mask, and return 1 if the result is zero, otherwise return 0.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
- FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
- {
- int64x2_t a_and_mask =
-@@ -8514,7 +7487,7 @@ FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
- // the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
- // zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
- // otherwise return 0.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
- FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
- {
- uint64x2_t zf =
-@@ -8529,7 +7502,7 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
- // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
- // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
- // otherwise set CF to 0. Return the CF value.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
- FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
- {
- int64x2_t s64 =
-@@ -8542,14 +7515,14 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
- // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
- // otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
- // otherwise return 0.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
- #define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
-
- // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
- // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
- // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
- // otherwise set CF to 0. Return the ZF value.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
- FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
- {
- int64x2_t s64 =
-@@ -9028,7 +8001,7 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
- FORCE_INLINE int _sse2neon_clz(unsigned int x)
- {
- #if _MSC_VER
-- DWORD cnt = 0;
-+ unsigned long cnt = 0;
- if (_BitScanForward(&cnt, x))
- return cnt;
- return 32;
-@@ -9040,7 +8013,7 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x)
- FORCE_INLINE int _sse2neon_ctz(unsigned int x)
- {
- #if _MSC_VER
-- DWORD cnt = 0;
-+ unsigned long cnt = 0;
- if (_BitScanReverse(&cnt, x))
- return 31 - cnt;
- return 32;
-@@ -9053,18 +8026,16 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
- {
- #if _MSC_VER
- unsigned long cnt;
--#ifdef defined(SSE2NEON_HAS_BITSCAN64)
-- (defined(_M_AMD64) || defined(__x86_64__))
-- if((_BitScanForward64(&cnt, x))
-- return (int)(cnt);
-+#if defined(SSE2NEON_HAS_BITSCAN64)
-+ if ((_BitScanForward64(&cnt, x))
-+ return (int)(cnt);
- #else
- if (_BitScanForward(&cnt, (unsigned long) (x)))
- return (int) cnt;
- if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
- return (int) (cnt + 32);
--#endif
-- return 64;
--#else
-+#endif /* SSE2NEON_HAS_BITSCAN64 */
-+#else /* assume GNU compatible compilers */
- return x != 0 ? __builtin_ctzll(x) : 64;
- #endif
- }
-@@ -9155,7 +8126,7 @@ FORCE_INLINE int _mm_cmpestrc(__m128i a,
-
- // Compare packed strings in a and b with lengths la and lb using the control
- // in imm8, and store the generated index in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestri
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
- FORCE_INLINE int _mm_cmpestri(__m128i a,
- int la,
- __m128i b,
-@@ -9168,7 +8139,7 @@ FORCE_INLINE int _mm_cmpestri(__m128i a,
-
- // Compare packed strings in a and b with lengths la and lb using the control
- // in imm8, and store the generated mask in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrm
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
- FORCE_INLINE __m128i
- _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
- {
-@@ -9324,8 +8295,8 @@ FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
- }
-
- // Starting with the initial value in crc, accumulates a CRC32 value for
--// unsigned 16-bit integer v.
--// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
-+// unsigned 16-bit integer v, and stores the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
- FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
- {
- #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-@@ -9342,8 +8313,8 @@ FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
- }
-
- // Starting with the initial value in crc, accumulates a CRC32 value for
--// unsigned 32-bit integer v.
--// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
-+// unsigned 32-bit integer v, and stores the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
- FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
- {
- #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-@@ -9360,8 +8331,8 @@ FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
- }
-
- // Starting with the initial value in crc, accumulates a CRC32 value for
--// unsigned 64-bit integer v.
--// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
-+// unsigned 64-bit integer v, and stores the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
- FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
- {
- #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-@@ -9376,8 +8347,8 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
- }
-
- // Starting with the initial value in crc, accumulates a CRC32 value for
--// unsigned 8-bit integer v.
--// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
-+// unsigned 8-bit integer v, and stores the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
- FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
- {
- #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-@@ -9486,43 +8457,61 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
-
- /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
- #define SSE2NEON_AES_H0(x) (x)
--static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
--static const uint8_t SSE2NEON_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
-+static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
-+static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
- #undef SSE2NEON_AES_H0
-
--// In the absence of crypto extensions, implement aesenc using regular neon
-+/* x_time function and matrix multiply function */
-+#if !defined(__aarch64__)
-+#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
-+#define SSE2NEON_MULTIPLY(x, y) \
-+ (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
-+ ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
-+ ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
-+ ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
-+#endif
-+
-+// In the absence of crypto extensions, implement aesenc using regular NEON
- // intrinsics instead. See:
- // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
- // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
--// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
--// for more information Reproduced with permission of the author.
-+// for more information.
- FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
- {
- #if defined(__aarch64__)
-- static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
-- 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
-- 0xc, 0x1, 0x6, 0xb};
-- static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-- 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
-+ static const uint8_t shift_rows[] = {
-+ 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
-+ 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
-+ };
-+ static const uint8_t ror32by8[] = {
-+ 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-+ 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-+ };
-
- uint8x16_t v;
- uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-- // shift rows
-+ /* shift rows */
- w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
-- // sub bytes
-- v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
-+ /* sub bytes */
-+ // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
-+ // look up each of the table. After each lookup, we load the next table
-+ // which locates at the next 64-bytes. In the meantime, the index in the
-+ // table would be smaller than it was, so the index parameters of
-+ // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
-+ v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-+ // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-- // mix columns
-+ /* mix columns */
- w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
- w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
- w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
-- // add round key
-+ /* add round key */
- return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
- #else /* ARMv7-A implementation for a table-based AES */
-@@ -9587,31 +8576,34 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
- FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
- {
- #if defined(__aarch64__)
-- static const uint8_t inv_shift_rows[] = {0x0, 0xd, 0xa, 0x7, 0x4, 0x1,
-- 0xe, 0xb, 0x8, 0x5, 0x2, 0xf,
-- 0xc, 0x9, 0x6, 0x3};
-- static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-- 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
-+ static const uint8_t inv_shift_rows[] = {
-+ 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-+ 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-+ };
-+ static const uint8_t ror32by8[] = {
-+ 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-+ 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-+ };
-
- uint8x16_t v;
- uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-- // shift rows
-+ // inverse shift rows
- w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-
-- // sub bytes
-- v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_rsbox), w);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_rsbox + 0x40), w - 0x40);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_rsbox + 0x80), w - 0x80);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_rsbox + 0xc0), w - 0xc0);
-+ // inverse sub bytes
-+ v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-
-+ // inverse mix columns
- // muliplying 'v' by 4 in GF(2^8)
- w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
- w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
- v ^= w;
- v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-
-- // mix columns
- w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
- 0x1b); // muliplying 'v' by 2 in GF(2^8)
- w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-@@ -9621,35 +8613,29 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
- return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
- #else /* ARMv7-A NEON implementation */
--/* FIXME: optimized for NEON */
--#define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
--#define MULTIPLY(x, y) \
-- (((y & 1) * x) ^ ((y >> 1 & 1) * XT(x)) ^ ((y >> 2 & 1) * XT(XT(x))) ^ \
-- ((y >> 3 & 1) * XT(XT(XT(x)))) ^ ((y >> 4 & 1) * XT(XT(XT(XT(x))))))
--
-+ /* FIXME: optimized for NEON */
- uint8_t i, e, f, g, h, v[4][4];
- uint8_t *_a = (uint8_t *) &a;
- for (i = 0; i < 16; ++i) {
-- v[((i / 4) + (i % 4)) % 4][i % 4] = SSE2NEON_rsbox[_a[i]];
-+ v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
- }
-
-+ // inverse mix columns
- for (i = 0; i < 4; ++i) {
- e = v[i][0];
- f = v[i][1];
- g = v[i][2];
- h = v[i][3];
-
-- v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^
-- MULTIPLY(h, 0x09);
-- v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^
-- MULTIPLY(h, 0x0d);
-- v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^
-- MULTIPLY(h, 0x0b);
-- v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^
-- MULTIPLY(h, 0x0e);
-+ v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-+ SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-+ v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-+ SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-+ v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-+ SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-+ v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-+ SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
- }
--#undef XT
--#undef MULTIPLY
-
- return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
- #endif
-@@ -9657,7 +8643,7 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
-
- // Perform the last round of an AES encryption flow on data (state) in a using
- // the round key in RoundKey, and store the result in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
- FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
- {
- #if defined(__aarch64__)
-@@ -9673,59 +8659,166 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
- w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
- // sub bytes
-- v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
-- // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
-+ v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-- // add round key
-+ // add round key
- return vreinterpretq_m128i_u8(v) ^ RoundKey;
-
- #else /* ARMv7-A implementation */
- uint8_t v[16] = {
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
- };
-
- return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
- #endif
- }
-
-+// Perform the last round of an AES decryption flow on data (state) in a using
-+// the round key in RoundKey, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-+{
-+#if defined(__aarch64__)
-+ static const uint8_t inv_shift_rows[] = {
-+ 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-+ 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-+ };
-+
-+ uint8x16_t v;
-+ uint8x16_t w = vreinterpretq_u8_m128i(a);
-+
-+ // inverse shift rows
-+ w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-+
-+ // inverse sub bytes
-+ v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-+
-+ // add round key
-+ return vreinterpretq_m128i_u8(v) ^ RoundKey;
-+
-+#else /* ARMv7-A NEON implementation */
-+ /* FIXME: optimized for NEON */
-+ uint8_t v[4][4];
-+ uint8_t *_a = (uint8_t *) &a;
-+ for (int i = 0; i < 16; ++i) {
-+ v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
-+ }
-+
-+ return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
-+#endif
-+}
-+
-+// Perform the InvMixColumns transformation on a and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-+{
-+#if defined(__aarch64__)
-+ static const uint8_t ror32by8[] = {
-+ 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-+ 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-+ };
-+ uint8x16_t v = vreinterpretq_u8_m128i(a);
-+ uint8x16_t w;
-+
-+ // multiplying 'v' by 4 in GF(2^8)
-+ w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-+ w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-+ v ^= w;
-+ v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-+
-+ // multiplying 'v' by 2 in GF(2^8)
-+ w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-+ w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-+ w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-+ return vreinterpretq_m128i_u8(w);
-+
-+#else /* ARMv7-A NEON implementation */
-+ uint8_t i, e, f, g, h, v[4][4];
-+ vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
-+ for (i = 0; i < 4; ++i) {
-+ e = v[i][0];
-+ f = v[i][1];
-+ g = v[i][2];
-+ h = v[i][3];
-+
-+ v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-+ SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-+ v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-+ SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-+ v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-+ SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-+ v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-+ SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
-+ }
-+
-+ return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
-+#endif
-+}
-+
-+// Assist in expanding the AES cipher key by computing steps towards generating
-+// a round key for encryption cipher using data from a and an 8-bit round
-+// constant specified in imm8, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
-+//
- // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
- // This instruction generates a round key for AES encryption. See
- // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
- // for details.
--//
--// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
--FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
-+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
- {
-- uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
-- uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
-+#if defined(__aarch64__)
-+ uint8x16_t _a = vreinterpretq_u8_m128i(a);
-+ uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
-+
-+ uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
-+ uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
-+ uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
-+
-+ return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
-+
-+#else /* ARMv7-A NEON implementation */
-+ uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
-+ uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
- for (int i = 0; i < 4; ++i) {
-- ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
-- ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
-+ ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
-+ ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
- }
- return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
- ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
-+#endif
- }
- #undef SSE2NEON_AES_SBOX
- #undef SSE2NEON_AES_RSBOX
-
-+#if defined(__aarch64__)
-+#undef SSE2NEON_XT
-+#undef SSE2NEON_MULTIPLY
-+#endif
-+
- #else /* __ARM_FEATURE_CRYPTO */
- // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
- // AESMC and then manually applying the real key as an xor operation. This
-@@ -9750,7 +8843,9 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
- vreinterpretq_u8_m128i(RoundKey)));
- }
-
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
-+// Perform the last round of an AES encryption flow on data (state) in a using
-+// the round key in RoundKey, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
- FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
- {
- return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
-@@ -9758,6 +8853,23 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
- RoundKey);
- }
-
-+// Perform the last round of an AES decryption flow on data (state) in a using
-+// the round key in RoundKey, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-+{
-+ return vreinterpretq_m128i_u8(
-+ vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^
-+ vreinterpretq_u8_m128i(RoundKey));
-+}
-+
-+// Perform the InvMixColumns transformation on a and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-+{
-+ return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
-+}
-+
- // Assist in expanding the AES cipher key by computing steps towards generating
- // a round key for encryption cipher using data from a and an 8-bit round
- // constant specified in imm8, and store the result in dst."
-@@ -9783,7 +8895,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
-
- // Perform a carry-less multiplication of two 64-bit integers, selected from a
- // and b according to imm8, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
- FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
- {
- uint64x2_t a = vreinterpretq_u64_m128i(_a);
-@@ -9828,7 +8940,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
-
- // Count the number of bits set to 1 in unsigned 32-bit integer a, and
- // return that count in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
- FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
- {
- #if defined(__aarch64__)
-@@ -9855,7 +8967,7 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
-
- // Count the number of bits set to 1 in unsigned 64-bit integer a, and
- // return that count in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
- FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
- {
- #if defined(__aarch64__)
-@@ -9911,7 +9023,6 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
-
- // Return the current 64-bit value of the processor's time-stamp counter.
- // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
--
- FORCE_INLINE uint64_t _rdtsc(void)
- {
- #if defined(__aarch64__)
diff --git a/bazel/patches/emp-tool.patch b/bazel/patches/emp-tool.patch
deleted file mode 100644
index ac40006..0000000
--- a/bazel/patches/emp-tool.patch
+++ /dev/null
@@ -1,175 +0,0 @@
-diff --git a/emp-tool/utils/aes.h b/emp-tool/utils/aes.h
-index 0235544..75a8486 100644
---- a/emp-tool/utils/aes.h
-+++ b/emp-tool/utils/aes.h
-@@ -54,6 +54,10 @@
-
- #include "emp-tool/utils/block.h"
-
-+#ifdef __aarch64__
-+#include "emp-tool/utils/sse2neon.h"
-+#endif
-+
- namespace emp {
-
- typedef struct { block rd_key[11]; unsigned int rounds; } AES_KEY;
-@@ -103,6 +107,7 @@ AES_set_encrypt_key(const block userkey, AES_KEY *key) {
-
- #ifdef __x86_64__
- __attribute__((target("aes,sse2")))
-+#endif
- inline void AES_ecb_encrypt_blks(block *blks, unsigned int nblks, const AES_KEY *key) {
- for (unsigned int i = 0; i < nblks; ++i)
- blks[i] = _mm_xor_si128(blks[i], key->rd_key[0]);
-@@ -112,22 +117,6 @@ inline void AES_ecb_encrypt_blks(block *blks, unsigned int nblks, const AES_KEY
- for (unsigned int i = 0; i < nblks; ++i)
- blks[i] = _mm_aesenclast_si128(blks[i], key->rd_key[key->rounds]);
- }
--#elif __aarch64__
--inline void AES_ecb_encrypt_blks(block *_blks, unsigned int nblks, const AES_KEY *key) {
-- uint8x16_t * blks = (uint8x16_t*)(_blks);
-- uint8x16_t * keys = (uint8x16_t*)(key->rd_key);
-- auto * first = blks;
-- for (unsigned int j = 0; j < key->rounds-1; ++j) {
-- uint8x16_t key_j = (uint8x16_t)keys[j];
-- blks = first;
-- for (unsigned int i = 0; i < nblks; ++i, ++blks)
-- *blks = vaesmcq_u8(vaeseq_u8(*blks, key_j));
-- }
-- uint8x16_t last_key = (uint8x16_t)keys[key->rounds-1];
-- for (unsigned int i = 0; i < nblks; ++i, ++first)
-- *first = vaeseq_u8(*first, last_key) ^ (uint8x16_t)keys[key->rounds];
--}
--#endif
-
- #ifdef __GNUC__
- #ifndef __clang__
-diff --git a/emp-tool/utils/aes_opt.h b/emp-tool/utils/aes_opt.h
-index 2594e32..6a78b75 100644
---- a/emp-tool/utils/aes_opt.h
-+++ b/emp-tool/utils/aes_opt.h
-@@ -58,7 +58,6 @@ static inline void AES_opt_key_schedule(block* user_key, AES_KEY *keys) {
- /*
- * With numKeys keys, use each key to encrypt numEncs blocks.
- */
--#ifdef __x86_64__
- template
- static inline void ParaEnc(block *blks, AES_KEY *keys) {
- block * first = blks;
-@@ -90,29 +89,6 @@ static inline void ParaEnc(block *blks, AES_KEY *keys) {
- }
- }
- }
--#elif __aarch64__
--template
--static inline void ParaEnc(block *_blks, AES_KEY *keys) {
-- uint8x16_t * first = (uint8x16_t*)(_blks);
--
-- for (unsigned int r = 0; r < 9; ++r) {
-- auto blks = first;
-- for(size_t i = 0; i < numKeys; ++i) {
-- uint8x16_t K = vreinterpretq_u8_m128i(keys[i].rd_key[r]);
-- for(size_t j = 0; j < numEncs; ++j, ++blks)
-- *blks = vaesmcq_u8(vaeseq_u8(*blks, K));
-- }
-- }
--
-- auto blks = first;
-- for(size_t i = 0; i < numKeys; ++i) {
-- uint8x16_t K = vreinterpretq_u8_m128i(keys[i].rd_key[9]);
-- uint8x16_t K2 = vreinterpretq_u8_m128i(keys[i].rd_key[10]);
-- for(size_t j = 0; j < numEncs; ++j, ++blks)
-- *blks = vaeseq_u8(*blks, K) ^ K2;
-- }
--}
--#endif
-
- }
- #endif
-diff --git a/emp-tool/utils/block.h b/emp-tool/utils/block.h
-index f7d3d34..fcc21c1 100644
---- a/emp-tool/utils/block.h
-+++ b/emp-tool/utils/block.h
-@@ -5,16 +5,7 @@
- #include
- #elif __aarch64__
- #include "sse2neon.h"
--inline __m128i _mm_aesimc_si128(__m128i a) {
-- return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
--}
--
--inline __m128i _mm_aesdeclast_si128 (__m128i a, __m128i RoundKey)
--{
-- return vreinterpretq_m128i_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^ vreinterpretq_u8_m128i(RoundKey));
--}
- #endif
--
- #include
- #include
- #include
-diff --git a/emp-tool/utils/f2k.h b/emp-tool/utils/f2k.h
-index 7fe1b1b..f6186a1 100644
---- a/emp-tool/utils/f2k.h
-+++ b/emp-tool/utils/f2k.h
-@@ -6,6 +6,7 @@ namespace emp {
- /* multiplication in galois field without reduction */
- #ifdef __x86_64__
- __attribute__((target("sse2,pclmul")))
-+ #endif
- inline void mul128(__m128i a, __m128i b, __m128i *res1, __m128i *res2) {
- __m128i tmp3, tmp4, tmp5, tmp6;
- tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
-@@ -22,28 +23,6 @@ namespace emp {
- *res1 = tmp3;
- *res2 = tmp6;
- }
-- #elif __aarch64__
-- inline void mul128(__m128i a, __m128i b, __m128i *res1, __m128i *res2) {
-- __m128i tmp3, tmp4, tmp5, tmp6;
-- poly64_t a_lo = (poly64_t)vget_low_u64(vreinterpretq_u64_m128i(a));
-- poly64_t a_hi = (poly64_t)vget_high_u64(vreinterpretq_u64_m128i(a));
-- poly64_t b_lo = (poly64_t)vget_low_u64(vreinterpretq_u64_m128i(b));
-- poly64_t b_hi = (poly64_t)vget_high_u64(vreinterpretq_u64_m128i(b));
-- tmp3 = (__m128i)vmull_p64(a_lo, b_lo);
-- tmp4 = (__m128i)vmull_p64(a_hi, b_lo);
-- tmp5 = (__m128i)vmull_p64(a_lo, b_hi);
-- tmp6 = (__m128i)vmull_p64(a_hi, b_hi);
--
-- tmp4 = _mm_xor_si128(tmp4, tmp5);
-- tmp5 = _mm_slli_si128(tmp4, 8);
-- tmp4 = _mm_srli_si128(tmp4, 8);
-- tmp3 = _mm_xor_si128(tmp3, tmp5);
-- tmp6 = _mm_xor_si128(tmp6, tmp4);
-- // initial mul now in tmp3, tmp6
-- *res1 = tmp3;
-- *res2 = tmp6;
-- }
-- #endif
-
- /* multiplication in galois field with reduction */
- #ifdef __x86_64__
-diff --git a/emp-tool/utils/prg.h b/emp-tool/utils/prg.h
-index 23bbf42..5101d7e 100644
---- a/emp-tool/utils/prg.h
-+++ b/emp-tool/utils/prg.h
-@@ -82,7 +82,7 @@ class PRG { public:
- } else {
- block tmp[2];
- random_block(tmp, 2);
-- memcpy(data, tmp, nbytes);
-+ memcpy(data, tmp, nbytes <= 32? nbytes : 32);
- }
- }
-
-diff --git a/emp-tool/utils/block.h b/emp-tool/utils/block.h
-index f7d3d34..3c25a73 100644
---- a/emp-tool/utils/block.h
-+++ b/emp-tool/utils/block.h
-@@ -19,6 +19,7 @@ inline __m128i _mm_aesdeclast_si128 (__m128i a, __m128i RoundKey)
- #include
- #include
- #include
-+#include
-
- namespace emp {
-
diff --git a/bazel/patches/flatbuffers.patch b/bazel/patches/flatbuffers.patch
deleted file mode 100644
index 6f9c4ee..0000000
--- a/bazel/patches/flatbuffers.patch
+++ /dev/null
@@ -1,306 +0,0 @@
-diff --git a/grpc/BUILD.bazel b/grpc/BUILD.bazel
-deleted file mode 100644
-index e69de29b..00000000
-diff --git a/grpc/src/compiler/BUILD.bazel b/grpc/src/compiler/BUILD.bazel
-deleted file mode 100644
-index 0efa9560..00000000
---- a/grpc/src/compiler/BUILD.bazel
-+++ /dev/null
-@@ -1,131 +0,0 @@
--load("@rules_cc//cc:defs.bzl", "cc_library")
--
--package(
-- default_visibility = ["//visibility:public"],
--)
--
--filegroup(
-- name = "distribution",
-- srcs = [
-- "BUILD.bazel",
-- ] + glob([
-- "*.cc",
-- "*.h",
-- ]),
--)
--
--filegroup(
-- name = "common_headers",
-- srcs = [
-- "schema_interface.h",
-- ],
--)
--
--cc_library(
-- name = "cpp_generator",
-- srcs = [
-- "cpp_generator.cc",
-- ],
-- hdrs = [
-- "cpp_generator.h",
-- ":common_headers",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- deps = [
-- "//:flatbuffers",
-- ],
--)
--
--cc_library(
-- name = "go_generator",
-- srcs = [
-- "go_generator.cc",
-- ],
-- hdrs = [
-- "go_generator.h",
-- ":common_headers",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- deps = [
-- "//:flatbuffers",
-- ],
--)
--
--cc_library(
-- name = "java_generator",
-- srcs = [
-- "java_generator.cc",
-- ],
-- hdrs = [
-- "java_generator.h",
-- ":common_headers",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- deps = [
-- "//:flatbuffers",
-- ],
--)
--
--cc_library(
-- name = "python_generator",
-- hdrs = [
-- "python_generator.h",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- deps = [
-- ":python_generator_private",
-- ],
--)
--
--cc_library(
-- name = "python_generator_private",
-- srcs = [
-- "python_generator.cc",
-- ],
-- hdrs = [
-- "python_generator.h",
-- ":common_headers",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- visibility = ["//visibility:private"],
-- deps = [
-- "//:flatbuffers",
-- ],
--)
--
--cc_library(
-- name = "swift_generator",
-- srcs = [
-- "swift_generator.cc",
-- ],
-- hdrs = [
-- "swift_generator.h",
-- ":common_headers",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- deps = [
-- "//:flatbuffers",
-- ],
--)
--
--cc_library(
-- name = "ts_generator",
-- srcs = [
-- "ts_generator.cc",
-- ],
-- hdrs = [
-- "ts_generator.h",
-- ":common_headers",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- deps = [
-- "//:flatbuffers",
-- ],
--)
-diff --git a/src/BUILD.bazel b/src/BUILD.bazel
-deleted file mode 100644
-index 679b10f7..00000000
---- a/src/BUILD.bazel
-+++ /dev/null
-@@ -1,159 +0,0 @@
--# @unused
--load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
--
--package(
-- default_visibility = ["//visibility:private"],
--)
--
--filegroup(
-- name = "distribution",
-- srcs = [
-- "BUILD.bazel",
-- ] + glob([
-- "*.cpp",
-- "*.h",
-- ]),
-- visibility = ["//visibility:public"],
--)
--
--cc_library(
-- name = "code_generators",
-- srcs = ["code_generators.cpp"],
-- hdrs = [
-- "//:public_headers",
-- ],
-- strip_include_prefix = "/include",
-- visibility = ["//:__subpackages__"],
--)
--
--cc_library(
-- name = "generate_fbs",
-- srcs = ["idl_gen_fbs.cpp"],
-- hdrs = ["idl_gen_fbs.h"],
-- strip_include_prefix = "/src",
-- visibility = ["//:__subpackages__"],
-- deps = [":code_generators"],
--)
--
--# Public flatc library to compile flatbuffer files at runtime.
--cc_library(
-- name = "flatbuffers",
-- srcs = [
-- "idl_gen_text.cpp",
-- "idl_gen_text.h",
-- "idl_parser.cpp",
-- "reflection.cpp",
-- "util.cpp",
-- ],
-- hdrs = [
-- "//:public_headers",
-- ],
-- linkopts = select({
-- # TODO: Bazel uses `clang` instead of `clang++` to link
-- # C++ code on BSD. Temporarily adding these linker flags while
-- # we wait for Bazel to resolve
-- # https://github.com/bazelbuild/bazel/issues/12023.
-- "//:platform_freebsd": ["-lm"],
-- "//:platform_openbsd": ["-lm"],
-- "//conditions:default": [],
-- }),
-- strip_include_prefix = "/include",
-- visibility = ["//:__subpackages__"],
-- deps = [
-- ":code_generators",
-- ":generate_fbs",
-- ],
--)
--
--# Public flatc compiler library.
--cc_library(
-- name = "flatc_library",
-- srcs = [
-- "annotated_binary_text_gen.cpp",
-- "annotated_binary_text_gen.h",
-- "bfbs_gen.h",
-- "bfbs_gen_lua.cpp",
-- "bfbs_gen_lua.h",
-- "bfbs_gen_nim.cpp",
-- "bfbs_gen_nim.h",
-- "bfbs_namer.h",
-- "binary_annotator.cpp",
-- "binary_annotator.h",
-- "flatc.cpp",
-- "namer.h",
-- ],
-- hdrs = [
-- "//:flatc_headers",
-- ],
-- strip_include_prefix = "/include",
-- visibility = ["//:__pkg__"],
-- deps = [
-- ":flatbuffers",
-- ],
--)
--
--# Public flatc compiler.
--cc_library(
-- name = "flatc",
-- srcs = [
-- "bfbs_gen.h",
-- "bfbs_gen_lua.cpp",
-- "bfbs_gen_lua.h",
-- "bfbs_gen_nim.cpp",
-- "bfbs_gen_nim.h",
-- "bfbs_namer.h",
-- "file_binary_writer.cpp",
-- "file_name_saving_file_manager.cpp",
-- "file_writer.cpp",
-- "flatc_main.cpp",
-- "idl_gen_binary.cpp",
-- "idl_gen_binary.h",
-- "idl_gen_cpp.cpp",
-- "idl_gen_cpp.h",
-- "idl_gen_csharp.cpp",
-- "idl_gen_csharp.h",
-- "idl_gen_dart.cpp",
-- "idl_gen_dart.h",
-- "idl_gen_go.cpp",
-- "idl_gen_go.h",
-- "idl_gen_grpc.cpp",
-- "idl_gen_java.cpp",
-- "idl_gen_java.h",
-- "idl_gen_json_schema.cpp",
-- "idl_gen_json_schema.h",
-- "idl_gen_kotlin.cpp",
-- "idl_gen_kotlin.h",
-- "idl_gen_kotlin_kmp.cpp",
-- "idl_gen_lobster.cpp",
-- "idl_gen_lobster.h",
-- "idl_gen_php.cpp",
-- "idl_gen_php.h",
-- "idl_gen_python.cpp",
-- "idl_gen_python.h",
-- "idl_gen_rust.cpp",
-- "idl_gen_rust.h",
-- "idl_gen_swift.cpp",
-- "idl_gen_swift.h",
-- "idl_gen_text.cpp",
-- "idl_gen_text.h",
-- "idl_gen_ts.cpp",
-- "idl_gen_ts.h",
-- "idl_namer.h",
-- "namer.h",
-- "util.cpp",
-- ],
-- hdrs = [
-- "//:flatc_headers",
-- ],
-- strip_include_prefix = "/include",
-- visibility = ["//:__pkg__"],
-- deps = [
-- ":flatc_library",
-- "//grpc/src/compiler:cpp_generator",
-- "//grpc/src/compiler:go_generator",
-- "//grpc/src/compiler:java_generator",
-- "//grpc/src/compiler:python_generator",
-- "//grpc/src/compiler:swift_generator",
-- "//grpc/src/compiler:ts_generator",
-- ],
--)
---
diff --git a/bazel/patches/grpc-1.66.patch b/bazel/patches/grpc-1.66.patch
new file mode 100644
index 0000000..b6f82e5
--- /dev/null
+++ b/bazel/patches/grpc-1.66.patch
@@ -0,0 +1,20 @@
+diff --git a/third_party/BUILD b/third_party/BUILD
+index 77cb52d0fc..c4b647f5c9 100644
+--- a/third_party/BUILD
++++ b/third_party/BUILD
+@@ -18,13 +18,13 @@ package(default_visibility = ["//:__subpackages__"])
+
+ alias(
+ name = "libssl",
+- actual = "@boringssl//:ssl",
++ actual = "@openssl//:ssl",
+ tags = ["manual"],
+ )
+
+ alias(
+ name = "libcrypto",
+- actual = "@boringssl//:crypto",
++ actual = "@openssl//:crypto",
+ tags = ["manual"],
+ )
+
diff --git a/bazel/patches/grpc-module-file.patch b/bazel/patches/grpc-module-file.patch
new file mode 100644
index 0000000..29dc393
--- /dev/null
+++ b/bazel/patches/grpc-module-file.patch
@@ -0,0 +1,13 @@
+diff --git a/MODULE.bazel b/MODULE.bazel
+index 4a8fbe83..8650f678 100644
+--- a/MODULE.bazel
++++ b/MODULE.bazel
+@@ -8,7 +8,7 @@ module(
+ bazel_dep(name = "abseil-cpp", version = "20240116.0", repo_name = "com_google_absl")
+ bazel_dep(name = "apple_support", version = "1.15.1", repo_name = "build_bazel_apple_support")
+ bazel_dep(name = "bazel_skylib", version = "1.5.0")
+-bazel_dep(name = "boringssl", version = "0.0.0-20230215-5c22014")
++bazel_dep(name = "openssl", version = "3.3.2")
+ bazel_dep(name = "c-ares", version = "1.15.0", repo_name = "com_github_cares_cares")
+ bazel_dep(name = "gazelle", version = "0.36.0", repo_name = "bazel_gazelle")
+ bazel_dep(name = "google_benchmark", version = "1.8.4", repo_name = "com_github_google_benchmark")
\ No newline at end of file
diff --git a/bazel/patches/grpc.patch b/bazel/patches/grpc.patch
deleted file mode 100644
index fd8e09f..0000000
--- a/bazel/patches/grpc.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-diff --git a/bazel/grpc_deps.bzl b/bazel/grpc_deps.bzl
-index 5e65a65df4..03bbd2361e 100644
---- a/bazel/grpc_deps.bzl
-+++ b/bazel/grpc_deps.bzl
-@@ -57,12 +57,12 @@ def grpc_deps():
-
- native.bind(
- name = "libssl",
-- actual = "@boringssl//:ssl",
-+ actual = "@com_github_openssl_openssl//:openssl",
- )
-
- native.bind(
- name = "libcrypto",
-- actual = "@boringssl//:crypto",
-+ actual = "@com_github_openssl_openssl//:openssl",
- )
-
- native.bind(
-diff --git a/src/core/lib/iomgr/tcp_posix.cc b/src/core/lib/iomgr/tcp_posix.cc
-index 72e1b6609e..aded52d0db 100644
---- a/src/core/lib/iomgr/tcp_posix.cc
-+++ b/src/core/lib/iomgr/tcp_posix.cc
-@@ -41,6 +41,8 @@
- #include
- #include
-
-+#include "absl/strings/str_cat.h"
-+
- #include
- #include
- #include
diff --git a/bazel/patches/ippcp.patch b/bazel/patches/ippcp.patch
deleted file mode 100644
index 0af05b2..0000000
--- a/bazel/patches/ippcp.patch
+++ /dev/null
@@ -1,250 +0,0 @@
-diff --git a/sources/cmake/linux/GNU8.2.0.cmake b/sources/cmake/linux/GNU8.2.0.cmake
-index 24d7e0f..15dd433 100644
---- a/sources/cmake/linux/GNU8.2.0.cmake
-+++ b/sources/cmake/linux/GNU8.2.0.cmake
-@@ -32,7 +32,7 @@ set(LINK_FLAG_DYNAMIC_LINUX "${LINK_FLAG_SECURITY} -nostdlib")
- # Dynamically link lib c (libdl is for old apps)
- set(LINK_FLAG_DYNAMIC_LINUX "${LINK_FLAG_DYNAMIC_LINUX} -Wl,-call_shared,-lc")
- # Create a shared library
--set(LINK_FLAG_DYNAMIC_LINUX "-Wl,-shared")
-+set(LINK_FLAG_DYNAMIC_LINUX "-Wl,-shared,-fuse-ld=bfd")
- if(${ARCH} MATCHES "ia32")
- # Tells the compiler to generate code for a specific architecture (32)
- set(LINK_FLAG_DYNAMIC_LINUX "${LINK_FLAG_DYNAMIC_LINUX} -m32")
-@@ -74,7 +74,7 @@ if ((${ARCH} MATCHES "ia32") OR (NOT NONPIC_LIB))
- endif()
-
- # Security flag that adds compile-time and run-time checks
--set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_FORTIFY_SOURCE=2")
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2")
-
- if(NOT NONPIC_LIB)
- # Position Independent Execution (PIE)
-@@ -95,6 +95,8 @@ if(${ARCH} MATCHES "ia32")
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m32")
- endif(${ARCH} MATCHES "ia32")
-
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unknown-pragmas -Wno-missing-braces -Wno-comment -Wno-strict-aliasing -Wno-parentheses -Wno-array-parameter")
-+
- # Optimization level = 3, no-debug definition (turns off asserts), warnings=errors
- set (CMAKE_C_FLAGS_RELEASE " -O3 -DNDEBUG -Werror")
-
-diff --git a/sources/cmake/macosx/AppleClang11.0.0.cmake b/sources/cmake/macosx/AppleClang11.0.0.cmake
-index 5b92877..ccb963e 100644
---- a/sources/cmake/macosx/AppleClang11.0.0.cmake
-+++ b/sources/cmake/macosx/AppleClang11.0.0.cmake
-@@ -20,12 +20,6 @@
-
- # Security Linker flags
- set(LINK_FLAG_SECURITY "")
--# Disallows undefined symbols in object files. Undefined symbols in shared libraries are still allowed
--set(LINK_FLAG_SECURITY "${LINK_FLAG_SECURITY} -Wl,-z,defs")
--# Stack execution protection
--set(LINK_FLAG_SECURITY "${LINK_FLAG_SECURITY} -Wl,-z,noexecstack")
--# Data relocation and protection (RELRO)
--set(LINK_FLAG_SECURITY "${LINK_FLAG_SECURITY} -Wl,-z,relro -Wl,-z,now")
- # Prevents the compiler from using standard libraries and startup files when linking.
- set(LINK_FLAG_DYNAMIC_MACOSX "${LINK_FLAG_SECURITY} -nostdlib")
- # Dynamically link lib c (libdl is for old apps)
-@@ -79,7 +73,7 @@ if ((${ARCH} MATCHES "ia32") OR (NOT NONPIC_LIB))
- endif()
-
- # Security flag that adds compile-time and run-time checks
--set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_FORTIFY_SOURCE=2")
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2")
-
- if(NOT NONPIC_LIB)
- # Position Independent Execution (PIE)
-@@ -98,6 +92,8 @@ if(${ARCH} MATCHES "ia32")
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m32")
- endif(${ARCH} MATCHES "ia32")
-
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-missing-braces -Wno-unused-command-line-argument -Wno-unused-but-set-variable -Wno-unknown-warning-option")
-+
- # Optimization level = 3, no-debug definition (turns off asserts), warnings=errors
- set (CMAKE_C_FLAGS_RELEASE " -O3 -DNDEBUG -Werror")
-
-@@ -115,3 +111,5 @@ set(l9_opt "${l9_opt} -march=haswell -mavx2 -maes -mpclmul -msha -mrdrnd -mrdsee
- set(n0_opt "${n0_opt} -march=knl -mavx2 -maes -mavx512f -mavx512cd -mavx512pf -mavx512er -mpclmul -msha -mrdrnd -mrdseed")
- set(k0_opt "${k0_opt} -march=skylake-avx512")
- set(k0_opt "${k0_opt} -maes -mavx512f -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mpclmul -msha -mrdrnd -mrdseed -madx -mgfni -mvaes -mvpclmulqdq -mavx512vbmi -mavx512vbmi2")
-+set(k1_opt "${k1_opt} -march=skylake-avx512")
-+set(k1_opt "${k1_opt} -maes -mavx512f -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mpclmul -msha -mrdrnd -mrdseed -madx -mgfni -mvaes -mvpclmulqdq -mavx512vbmi -mavx512vbmi2")
-diff --git a/sources/cmake/macosx/common.cmake b/sources/cmake/macosx/common.cmake
-index 85ec3ad..67bb9f9 100644
---- a/sources/cmake/macosx/common.cmake
-+++ b/sources/cmake/macosx/common.cmake
-@@ -18,7 +18,7 @@
- # Intel® Integrated Performance Primitives Cryptography (Intel® IPP Cryptography)
- #
-
--set(OS_DEFAULT_COMPILER Intel19.0.0)
-+set(OS_DEFAULT_COMPILER AppleClang11.0.0)
-
- set(LIBRARY_DEFINES "${LIBRARY_DEFINES} -DIPP_PIC -DOSXEM64T -DLINUX32E -D_ARCH_EM64T")
- #set(LIBRARY_DEFINES "${LIBRARY_DEFINES} -DBN_OPENSSL_DISABLE")
-\ No newline at end of file
-diff --git a/sources/ippcp/crypto_mb/src/cmake/linux/GNU.cmake b/sources/ippcp/crypto_mb/src/cmake/linux/GNU.cmake
-index a2abeeb..67aca8b 100644
---- a/sources/ippcp/crypto_mb/src/cmake/linux/GNU.cmake
-+++ b/sources/ippcp/crypto_mb/src/cmake/linux/GNU.cmake
-@@ -31,7 +31,7 @@ set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -Wformat -Wformat-security
- if(${CMAKE_BUILD_TYPE} STREQUAL "Release")
- if(NOT DEFINED NO_FORTIFY_SOURCE)
- # Security flag that adds compile-time and run-time checks.
-- set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -D_FORTIFY_SOURCE=2")
-+ set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2")
- endif()
- endif()
-
-@@ -51,7 +51,7 @@ set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -Werror")
- # Linker flags
-
- # Create shared library
--set(LINK_FLAGS_DYNAMIC " -Wl,-shared")
-+set(LINK_FLAGS_DYNAMIC " -Wl,-shared,-fuse-ld=bfd")
- # Add export files
- set(LINK_FLAGS_DYNAMIC "${LINK_FLAGS_DYNAMIC} ${CRYPTO_MB_SOURCES_DIR}/cmake/dll_export/crypto_mb.linux.lib-export")
-
-@@ -69,6 +69,7 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
-
- # Suppress warnings from casts from a pointer to an integer type of a different size
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast")
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unknown-pragmas -Wno-missing-braces -Wno-comment -Wno-strict-aliasing -Wno-parentheses")
-
- # Optimization level = 3, no-debug definition (turns off asserts)
- set(CMAKE_C_FLAGS_RELEASE " -O3 -DNDEBUG")
-diff --git a/sources/ippcp/crypto_mb/src/cmake/macosx/AppleClang.cmake b/sources/ippcp/crypto_mb/src/cmake/macosx/AppleClang.cmake
-index ea1641d..f98fc2d 100644
---- a/sources/ippcp/crypto_mb/src/cmake/macosx/AppleClang.cmake
-+++ b/sources/ippcp/crypto_mb/src/cmake/macosx/AppleClang.cmake
-@@ -17,10 +17,6 @@
- # Security Linker flags
-
- set(LINK_FLAG_SECURITY "")
--# Data relocation and protection (RELRO)
--set(LINK_FLAG_SECURITY "${LINK_FLAG_SECURITY} -Wl,-z,relro -Wl,-z,now")
--# Stack execution protection
--set(LINK_FLAG_SECURITY "${LINK_FLAG_SECURITY} -Wl,-z,noexecstack")
-
- # Security Compiler flags
-
-@@ -30,7 +26,7 @@ set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -Wformat -Wformat-security
-
- if(${CMAKE_BUILD_TYPE} STREQUAL "Release")
- # Security flag that adds compile-time and run-time checks.
-- set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -D_FORTIFY_SOURCE=2")
-+ set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2")
- endif()
-
- # Stack-based Buffer Overrun Detection
-@@ -65,6 +61,8 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
- # Suppress warnings from casts from a pointer to an integer type of a different size
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast")
-
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-missing-braces -Wno-unknown-warning-option")
-+
- # Optimization level = 3, no-debug definition (turns off asserts)
- set(CMAKE_C_FLAGS_RELEASE " -O3 -DNDEBUG")
- set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
-
-diff --git a/sources/cmake/linux/Clang9.0.0.cmake b/sources/cmake/linux/Clang9.0.0.cmake
-index 0015431..f93411c 100644
---- a/sources/cmake/linux/Clang9.0.0.cmake
-+++ b/sources/cmake/linux/Clang9.0.0.cmake
-@@ -79,7 +79,7 @@ endif()
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fcf-protection=full")
-
- # Security flag that adds compile-time and run-time checks
--set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_FORTIFY_SOURCE=2")
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-
- if(NOT NONPIC_LIB)
- # Position Independent Execution (PIE)
-@@ -107,7 +107,7 @@ if(SANITIZERS)
- endif(SANITIZERS)
-
- # Optimization level = 3, no-debug definition (turns off asserts), warnings=errors
--set (CMAKE_C_FLAGS_RELEASE " -O3 -DNDEBUG -Werror")
-+set (CMAKE_C_FLAGS_RELEASE " -O3 -DNDEBUG -Werror -Wno-unused-function -Wno-missing-braces -Wno-unused-but-set-variable -Wno-unknown-pragmas")
-
- # DEBUG flags - optimization level = 0, generation GDB information (-g)
- set (CMAKE_C_FLAGS_DEBUG " -O0 -g")
-
-diff --git a/sources/include/dispatcher.h b/sources/include/dispatcher.h
-index 8290df6..a2f93d7 100644
---- a/sources/include/dispatcher.h
-+++ b/sources/include/dispatcher.h
-@@ -92,9 +92,13 @@ extern "C" {
- #define LIB_W7 LIB_S8
- #elif defined( _ARCH_EM64T ) && !defined( OSXEM64T ) && !defined( WIN32E ) /* Linux* OS Intel64 supports N0 */
- enum lib_enum {
-- LIB_M7=0, LIB_N8=1, LIB_Y8=2, LIB_E9=3, LIB_L9=4, LIB_N0=5, LIB_K0=6, LIB_K1=7,LIB_NOMORE
-+ LIB_E9=0, LIB_L9=1, LIB_K0=2, LIB_K1=3,LIB_NOMORE
- };
-- #define LIB_PX LIB_M7
-+ #define LIB_PX LIB_E9
-+ #define LIB_M7 LIB_E9
-+ #define LIB_N8 LIB_E9
-+ #define LIB_Y8 LIB_E9
-+ #define LIB_N0 LIB_L9
- #elif defined( _ARCH_EM64T ) && !defined( OSXEM64T ) /* Windows* OS Intel64 doesn't support N0 */
- enum lib_enum {
- LIB_M7=0, LIB_N8=1, LIB_Y8=2, LIB_E9=3, LIB_L9=4, LIB_K0=5, LIB_K1=6, LIB_NOMORE
-@@ -103,11 +107,12 @@ extern "C" {
- #define LIB_N0 LIB_L9
- #elif defined( OSXEM64T )
- enum lib_enum {
-- LIB_Y8=0, LIB_E9=1, LIB_L9=2, LIB_K0=3, LIB_K1=4, LIB_NOMORE
-+ LIB_E9=0, LIB_L9=1, LIB_K0=2, LIB_K1=3, LIB_NOMORE
- };
-- #define LIB_PX LIB_Y8
-- #define LIB_M7 LIB_Y8
-- #define LIB_N8 LIB_Y8
-+ #define LIB_PX LIB_E9
-+ #define LIB_M7 LIB_E9
-+ #define LIB_N8 LIB_E9
-+ #define LIB_Y8 LIB_E9
- #define LIB_N0 LIB_L9
- #elif defined( _ARCH_LRB2 )
- enum lib_enum {
-diff --git a/sources/include/owndefs.h b/sources/include/owndefs.h
-index dcc1ede..7c1e93e 100644
---- a/sources/include/owndefs.h
-+++ b/sources/include/owndefs.h
-@@ -632,14 +632,14 @@ extern double __intel_castu64_f64(unsigned __int64 val);
-
- #elif defined(linux)
- /* LIN-32, LIN-64 */
-- #if ( defined(_W7) || defined(_M7) )
-+ #if ( defined(_W7) || defined(_E9) )
- #define _IPP_DATA 1
- #endif
-
-
- /* OSX-32, OSX-64 */
- #elif defined(OSX32) || defined(OSXEM64T)
-- #if ( defined(_Y8) )
-+ #if ( defined(_E9) )
- #define _IPP_DATA 1
- #endif
- #endif
-diff --git a/sources/ippcp/CMakeLists.txt b/sources/ippcp/CMakeLists.txt
-index 315d1a3..8b11c7a 100644
---- a/sources/ippcp/CMakeLists.txt
-+++ b/sources/ippcp/CMakeLists.txt
-@@ -40,12 +40,12 @@ if(WIN32)
- endif(WIN32)
- if(UNIX)
- if(APPLE)
-- set(BASE_PLATFORM_LIST ${BASE_PLATFORM_LIST} y8 e9 l9 k0 k1)
-+ set(BASE_PLATFORM_LIST ${BASE_PLATFORM_LIST} e9 l9 k0 k1)
- else()
- if (${ARCH} MATCHES "ia32")
- set(BASE_PLATFORM_LIST ${BASE_PLATFORM_LIST} w7 s8 p8 g9 h9)
- else()
-- set(BASE_PLATFORM_LIST ${BASE_PLATFORM_LIST} m7 n8 y8 e9 l9 n0 k0 k1)
-+ set(BASE_PLATFORM_LIST ${BASE_PLATFORM_LIST} e9 l9 k0 k1)
- endif(${ARCH} MATCHES "ia32")
- endif(APPLE)
- endif(UNIX)
diff --git a/bazel/patches/upb.patch b/bazel/patches/upb.patch
deleted file mode 100644
index da3b828..0000000
--- a/bazel/patches/upb.patch
+++ /dev/null
@@ -1,29 +0,0 @@
-diff --git a/bazel/build_defs.bzl b/bazel/build_defs.bzl
-index b5bc64f0..dc30a75f 100644
---- a/bazel/build_defs.bzl
-+++ b/bazel/build_defs.bzl
-@@ -38,7 +38,7 @@ _DEFAULT_CPPOPTS.extend([
- "-Wno-long-long",
- ])
- _DEFAULT_COPTS.extend([
-- "-std=c99",
-+ "-std=c11",
- "-pedantic",
- "-Werror=pedantic",
- "-Wall",
-diff --git a/upb/port_def.inc b/upb/port_def.inc
-index 92e4bf24..e355ace7 100644
---- a/upb/port_def.inc
-+++ b/upb/port_def.inc
-@@ -92,7 +92,11 @@
- #define UPB_ALIGN_UP(size, align) (((size) + (align) - 1) / (align) * (align))
- #define UPB_ALIGN_DOWN(size, align) ((size) / (align) * (align))
- #define UPB_ALIGN_MALLOC(size) UPB_ALIGN_UP(size, UPB_MALLOC_ALIGN)
-+#ifdef __clang__
-+#define UPB_ALIGN_OF(type) _Alignof(type)
-+#else
- #define UPB_ALIGN_OF(type) offsetof (struct { char c; type member; }, member)
-+#endif
-
- /* Hints to the compiler about likely/unlikely branches. */
- #if defined (__GNUC__) || defined(__clang__)
diff --git a/bazel/psi.bzl b/bazel/psi.bzl
index 50f0151..ae8a957 100644
--- a/bazel/psi.bzl
+++ b/bazel/psi.bzl
@@ -17,7 +17,7 @@ warpper bazel cc_xx to modify flags.
"""
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
-load("@yacl//bazel:yacl.bzl", "OMP_CFLAGS", "OMP_DEPS", "OMP_LINKFLAGS", "yacl_cmake_external")
+load("@yacl//bazel:yacl.bzl", "OMP_CFLAGS", "OMP_DEPS", "OMP_LINKFLAGS")
WARNING_FLAGS = [
"-Wall",
@@ -55,7 +55,7 @@ def psi_cc_library(
linkopts = linkopts + OMP_LINKFLAGS,
copts = _psi_copts() + copts + OMP_CFLAGS,
deps = deps + [
- "@com_github_gabime_spdlog//:spdlog",
+ "@spdlog//:spdlog",
] + OMP_DEPS,
**kargs
)
@@ -85,8 +85,8 @@ def psi_cc_test(
# -lm for tcmalloc
linkopts = linkopts + ["-lm", "-ldl"],
copts = _psi_copts() + copts,
- deps = deps + [
+ deps = [
"@com_google_googletest//:gtest_main",
- ],
+ ] + deps,
**kwargs
)
diff --git a/bazel/repositories.bzl b/bazel/repositories.bzl
index a2e01d5..48e8d6a 100644
--- a/bazel/repositories.bzl
+++ b/bazel/repositories.bzl
@@ -16,63 +16,26 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe")
def psi_deps():
- _com_github_nelhage_rules_boost()
- _bazel_platform()
- _upb()
- _com_github_emptoolkit_emp_tool()
_com_github_facebook_zstd()
_com_github_microsoft_seal()
_com_github_microsoft_apsi()
_com_github_microsoft_gsl()
_com_github_microsoft_kuku()
_com_google_flatbuffers()
- _org_apache_arrow()
- _com_github_grpc_grpc()
- _com_github_tencent_rapidjson()
- _com_github_xtensor_xsimd()
- _brotli()
- _com_github_lz4_lz4()
- _org_apache_thrift()
- _com_google_double_conversion()
- _bzip2()
- _com_github_google_snappy()
+
_com_github_google_perfetto()
_com_github_floodyberry_curve25519_donna()
- _com_github_ridiculousfish_libdivide()
- _com_github_sparsehash_sparsehash()
- _com_github_intel_ipp()
- _yacl()
+
_com_github_zeromq_cppzmq()
_com_github_zeromq_libzmq()
_com_github_log4cplus_log4cplus()
_com_github_open_source_parsers_jsoncpp()
-def _yacl():
- maybe(
- http_archive,
- name = "yacl",
- urls = [
- "https://github.com/secretflow/yacl/archive/refs/tags/0.4.5b5_nightly_20240913.tar.gz",
- ],
- strip_prefix = "yacl-0.4.5b5_nightly_20240913",
- sha256 = "04b332246e3ccb57b5dd612353ed2e84f894e5537a3e854c020c8172793c07d6",
- )
-
-def _bazel_platform():
- http_archive(
- name = "platforms",
- urls = [
- "https://mirror.bazel.build/github.com/bazelbuild/platforms/releases/download/0.0.8/platforms-0.0.8.tar.gz",
- "https://github.com/bazelbuild/platforms/releases/download/0.0.8/platforms-0.0.8.tar.gz",
- ],
- sha256 = "8150406605389ececb6da07cbcb509d5637a3ab9a24bc69b1101531367d89d74",
- )
-
def _com_github_facebook_zstd():
maybe(
http_archive,
- name = "com_github_facebook_zstd",
- build_file = "@psi//bazel:zstd.BUILD",
+ name = "zstd",
+ build_file = "//bazel:zstd.BUILD",
strip_prefix = "zstd-1.5.5",
sha256 = "98e9c3d949d1b924e28e01eccb7deed865eefebf25c2f21c702e5cd5b63b85e1",
type = ".tar.gz",
@@ -81,61 +44,10 @@ def _com_github_facebook_zstd():
],
)
-def _upb():
- maybe(
- http_archive,
- name = "upb",
- sha256 = "017a7e8e4e842d01dba5dc8aa316323eee080cd1b75986a7d1f94d87220e6502",
- strip_prefix = "upb-e4635f223e7d36dfbea3b722a4ca4807a7e882e2",
- urls = [
- "https://storage.googleapis.com/grpc-bazel-mirror/github.com/protocolbuffers/upb/archive/e4635f223e7d36dfbea3b722a4ca4807a7e882e2.tar.gz",
- "https://github.com/protocolbuffers/upb/archive/e4635f223e7d36dfbea3b722a4ca4807a7e882e2.tar.gz",
- ],
- patch_args = ["-p1"],
- patches = [
- "@psi//bazel/patches:upb.patch",
- ],
- )
-
-def _com_github_emptoolkit_emp_tool():
- maybe(
- http_archive,
- name = "com_github_emptoolkit_emp_tool",
- sha256 = "b9ab2380312e78020346b5d2db3d0244c7bd8098cb50f8b3620532ef491808d0",
- strip_prefix = "emp-tool-0.2.5",
- type = "tar.gz",
- patch_args = ["-p1"],
- patches = [
- "@psi//bazel/patches:emp-tool.patch",
- "@psi//bazel/patches:emp-tool-cmake.patch",
- "@psi//bazel/patches:emp-tool-sse2neon.patch",
- ],
- urls = [
- "https://github.com/emp-toolkit/emp-tool/archive/refs/tags/0.2.5.tar.gz",
- ],
- build_file = "@psi//bazel:emp-tool.BUILD",
- )
-
-def _com_github_intel_ipp():
- maybe(
- http_archive,
- name = "com_github_intel_ipp",
- sha256 = "d70f42832337775edb022ca8ac1ac418f272e791ec147778ef7942aede414cdc",
- strip_prefix = "cryptography-primitives-ippcp_2021.8",
- build_file = "@psi//bazel:ipp.BUILD",
- patch_args = ["-p1"],
- patches = [
- "@psi//bazel/patches:ippcp.patch",
- ],
- urls = [
- "https://github.com/intel/cryptography-primitives/archive/refs/tags/ippcp_2021.8.tar.gz",
- ],
- )
-
def _com_github_microsoft_seal():
maybe(
http_archive,
- name = "com_github_microsoft_seal",
+ name = "seal",
sha256 = "af9bf0f0daccda2a8b7f344f13a5692e0ee6a45fea88478b2b90c35648bf2672",
strip_prefix = "SEAL-4.1.1",
type = "tar.gz",
@@ -150,7 +62,7 @@ def _com_github_microsoft_seal():
def _com_github_microsoft_apsi():
maybe(
http_archive,
- name = "com_github_microsoft_apsi",
+ name = "apsi",
sha256 = "82c0f9329c79222675109d4a3682d204acd3ea9a724bcd98fa58eabe53851333",
strip_prefix = "APSI-0.11.0",
urls = [
@@ -183,7 +95,7 @@ def _com_github_microsoft_gsl():
def _com_github_microsoft_kuku():
maybe(
http_archive,
- name = "com_github_microsoft_kuku",
+ name = "kuku",
sha256 = "96ed5fad82ea8c8a8bb82f6eaf0b5dce744c0c2566b4baa11d8f5443ad1f83b7",
strip_prefix = "Kuku-2.1.0",
type = "tar.gz",
@@ -211,149 +123,10 @@ def _com_google_flatbuffers():
build_file = "@psi//bazel:flatbuffers.BUILD",
)
-def _org_apache_arrow():
- maybe(
- http_archive,
- name = "org_apache_arrow",
- urls = [
- "https://github.com/apache/arrow/archive/apache-arrow-10.0.0.tar.gz",
- ],
- sha256 = "2852b21f93ee84185a9d838809c9a9c41bf6deca741bed1744e0fdba6cc19e3f",
- strip_prefix = "arrow-apache-arrow-10.0.0",
- build_file = "@psi//bazel:arrow.BUILD",
- )
-
-def _com_github_grpc_grpc():
- maybe(
- http_archive,
- name = "com_github_grpc_grpc",
- sha256 = "7f42363711eb483a0501239fd5522467b31d8fe98d70d7867c6ca7b52440d828",
- strip_prefix = "grpc-1.51.0",
- type = "tar.gz",
- patch_args = ["-p1"],
- patches = ["@psi//bazel/patches:grpc.patch"],
- urls = [
- "https://github.com/grpc/grpc/archive/refs/tags/v1.51.0.tar.gz",
- ],
- )
-
-def _com_github_nelhage_rules_boost():
- # use boost 1.83
- RULES_BOOST_COMMIT = "cfa585b1b5843993b70aa52707266dc23b3282d0"
- maybe(
- http_archive,
- name = "com_github_nelhage_rules_boost",
- sha256 = "a7c42df432fae9db0587ff778d84f9dc46519d67a984eff8c79ae35e45f277c1",
- strip_prefix = "rules_boost-%s" % RULES_BOOST_COMMIT,
- patch_args = ["-p1"],
- patches = ["@psi//bazel/patches:boost.patch"],
- urls = [
- "https://github.com/nelhage/rules_boost/archive/%s.tar.gz" % RULES_BOOST_COMMIT,
- ],
- )
-
-def _com_github_tencent_rapidjson():
- maybe(
- http_archive,
- name = "com_github_tencent_rapidjson",
- urls = [
- "https://github.com/Tencent/rapidjson/archive/refs/tags/v1.1.0.tar.gz",
- ],
- sha256 = "bf7ced29704a1e696fbccf2a2b4ea068e7774fa37f6d7dd4039d0787f8bed98e",
- strip_prefix = "rapidjson-1.1.0",
- build_file = "@psi//bazel:rapidjson.BUILD",
- )
-
-def _com_github_xtensor_xsimd():
- maybe(
- http_archive,
- name = "com_github_xtensor_xsimd",
- urls = [
- "https://codeload.github.com/xtensor-stack/xsimd/tar.gz/refs/tags/8.1.0",
- ],
- sha256 = "d52551360d37709675237d2a0418e28f70995b5b7cdad7c674626bcfbbf48328",
- type = "tar.gz",
- strip_prefix = "xsimd-8.1.0",
- build_file = "@psi//bazel:xsimd.BUILD",
- )
-
-def _brotli():
- maybe(
- http_archive,
- name = "brotli",
- build_file = "@psi//bazel:brotli.BUILD",
- sha256 = "e720a6ca29428b803f4ad165371771f5398faba397edf6778837a18599ea13ff",
- strip_prefix = "brotli-1.1.0",
- urls = [
- "https://github.com/google/brotli/archive/refs/tags/v1.1.0.tar.gz",
- ],
- )
-
-def _com_github_lz4_lz4():
- maybe(
- http_archive,
- name = "com_github_lz4_lz4",
- urls = [
- "https://codeload.github.com/lz4/lz4/tar.gz/refs/tags/v1.9.3",
- ],
- sha256 = "030644df4611007ff7dc962d981f390361e6c97a34e5cbc393ddfbe019ffe2c1",
- type = "tar.gz",
- strip_prefix = "lz4-1.9.3",
- build_file = "@psi//bazel:lz4.BUILD",
- )
-
-def _org_apache_thrift():
- maybe(
- http_archive,
- name = "org_apache_thrift",
- build_file = "@psi//bazel:thrift.BUILD",
- sha256 = "31e46de96a7b36b8b8a457cecd2ee8266f81a83f8e238a9d324d8c6f42a717bc",
- strip_prefix = "thrift-0.21.0",
- urls = [
- "https://github.com/apache/thrift/archive/v0.21.0.tar.gz",
- ],
- )
-
-def _com_google_double_conversion():
- maybe(
- http_archive,
- name = "com_google_double_conversion",
- sha256 = "04ec44461850abbf33824da84978043b22554896b552c5fd11a9c5ae4b4d296e",
- strip_prefix = "double-conversion-3.3.0",
- build_file = "@psi//bazel:double-conversion.BUILD",
- urls = [
- "https://github.com/google/double-conversion/archive/refs/tags/v3.3.0.tar.gz",
- ],
- )
-
-def _bzip2():
- maybe(
- http_archive,
- name = "bzip2",
- build_file = "@psi//bazel:bzip2.BUILD",
- sha256 = "ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269",
- strip_prefix = "bzip2-1.0.8",
- urls = [
- "https://sourceware.org/pub/bzip2/bzip2-1.0.8.tar.gz",
- ],
- )
-
-def _com_github_google_snappy():
- maybe(
- http_archive,
- name = "com_github_google_snappy",
- urls = [
- "https://github.com/google/snappy/archive/refs/tags/1.1.9.tar.gz",
- ],
- sha256 = "75c1fbb3d618dd3a0483bff0e26d0a92b495bbe5059c8b4f1c962b478b6e06e7",
- strip_prefix = "snappy-1.1.9",
- build_file = "@psi//bazel:snappy.BUILD",
- )
-
def _com_github_google_perfetto():
maybe(
http_archive,
- name = "com_github_google_perfetto",
+ name = "perfetto",
urls = [
"https://github.com/google/perfetto/archive/refs/tags/v41.0.tar.gz",
],
@@ -367,7 +140,7 @@ def _com_github_google_perfetto():
def _com_github_floodyberry_curve25519_donna():
maybe(
http_archive,
- name = "com_github_floodyberry_curve25519_donna",
+ name = "curve25519-donna",
strip_prefix = "curve25519-donna-2fe66b65ea1acb788024f40a3373b8b3e6f4bbb2",
sha256 = "ba57d538c241ad30ff85f49102ab2c8dd996148456ed238a8c319f263b7b149a",
type = "tar.gz",
@@ -377,30 +150,6 @@ def _com_github_floodyberry_curve25519_donna():
],
)
-def _com_github_ridiculousfish_libdivide():
- maybe(
- http_archive,
- name = "com_github_ridiculousfish_libdivide",
- urls = [
- "https://github.com/ridiculousfish/libdivide/archive/refs/tags/5.0.tar.gz",
- ],
- sha256 = "01ffdf90bc475e42170741d381eb9cfb631d9d7ddac7337368bcd80df8c98356",
- strip_prefix = "libdivide-5.0",
- build_file = "@psi//bazel:libdivide.BUILD",
- )
-
-def _com_github_sparsehash_sparsehash():
- maybe(
- http_archive,
- name = "com_github_sparsehash_sparsehash",
- urls = [
- "https://github.com/sparsehash/sparsehash/archive/refs/tags/sparsehash-2.0.4.tar.gz",
- ],
- sha256 = "8cd1a95827dfd8270927894eb77f62b4087735cbede953884647f16c521c7e58",
- strip_prefix = "sparsehash-sparsehash-2.0.4",
- build_file = "@psi//bazel:sparsehash.BUILD",
- )
-
def _com_github_zeromq_cppzmq():
maybe(
http_archive,
diff --git a/bazel/seal.BUILD b/bazel/seal.BUILD
index e933044..92fc017 100644
--- a/bazel/seal.BUILD
+++ b/bazel/seal.BUILD
@@ -37,11 +37,11 @@ cmake(
# "SEAL_USE_INTEL_HEXL": "ON",
},
generate_args = ["-GNinja"],
- lib_source = "@com_github_microsoft_seal//:all",
+ lib_source = "@seal//:all",
out_include_dir = "include/SEAL-4.1",
out_static_libs = ["libseal-4.1.a"],
deps = [
- "@com_github_facebook_zstd//:zstd",
+ "@zstd",
"@com_github_microsoft_gsl//:Microsoft.GSL",
"@zlib",
# Uncomment to use hexl
diff --git a/bazel/snappy.BUILD b/bazel/snappy.BUILD
deleted file mode 100644
index 419b694..0000000
--- a/bazel/snappy.BUILD
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
- name = "all_srcs",
- srcs = glob(["**"]),
-)
-
-cmake(
- name = "snappy",
- cache_entries = {
- "SNAPPY_BUILD_TESTS": "OFF",
- "SNAPPY_BUILD_BENCHMARKS": "OFF",
- "CMAKE_INSTALL_LIBDIR": "lib",
- },
- generate_crosstool_file = False,
- install_args = [
- "--prefix $${INSTALLDIR}",
- ],
- lib_source = ":all_srcs",
- out_static_libs = [
- "libsnappy.a",
- ],
-)
diff --git a/bazel/thrift.BUILD b/bazel/thrift.BUILD
deleted file mode 100644
index 5d1cc9b..0000000
--- a/bazel/thrift.BUILD
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# copied from https://github.com/tensorflow/io/blob/v0.25.0/third_party/thrift.BUILD
-# Description:
-# Apache Thrift library
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"]) # Apache 2.0
-
-exports_files(["LICENSE"])
-
-cc_library(
- name = "thrift",
- srcs = glob([
- "lib/cpp/src/thrift/**/*.h",
- ]) + [
- "lib/cpp/src/thrift/protocol/TProtocol.cpp",
- "lib/cpp/src/thrift/transport/TBufferTransports.cpp",
- "lib/cpp/src/thrift/transport/TTransportException.cpp",
- ],
- hdrs = [
- "compiler/cpp/src/thrift/version.h",
- "lib/cpp/src/thrift/config.h",
- ],
- includes = [
- "lib/cpp/src",
- ],
- textual_hdrs = [
- "lib/cpp/src/thrift/protocol/TBinaryProtocol.tcc",
- "lib/cpp/src/thrift/protocol/TCompactProtocol.tcc",
- ],
- deps = [
- "@boost//:units",
- ],
-)
-
-genrule(
- name = "config_h",
- srcs = ["build/cmake/config.h.in"],
- outs = ["lib/cpp/src/thrift/config.h"],
- cmd = ("sed " +
- "-e 's/cmakedefine/define/g' " +
- "-e 's/$${PACKAGE}/thrift/g' " +
- "-e 's/$${PACKAGE_BUGREPORT}//g' " +
- "-e 's/$${PACKAGE_NAME}/thrift/g' " +
- "-e 's/$${PACKAGE_TARNAME}/thrift/g' " +
- "-e 's/$${PACKAGE_URL}//g' " +
- "-e 's/$${PACKAGE_VERSION}/0.12.0/g' " +
- "-e 's/$${PACKAGE_STRING}/thrift 0.12.0/g' " +
- "$< >$@"),
-)
diff --git a/bazel/xsimd.BUILD b/bazel/xsimd.BUILD
deleted file mode 100644
index 4c3361c..0000000
--- a/bazel/xsimd.BUILD
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# copied from https://github.com/tensorflow/io/blob/v0.25.0/third_party/xsimd.BUILD
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"]) # BSD 3-Clause
-
-exports_files(["LICENSE"])
-
-cc_library(
- name = "xsimd",
- srcs = [],
- hdrs = glob(
- [
- "include/xsimd/*.hpp",
- "include/xsimd/config/*.hpp",
- "include/xsimd/math/*.hpp",
- "include/xsimd/memory/*.hpp",
- "include/xsimd/stl/*.hpp",
- "include/xsimd/types/*.hpp",
- ],
- exclude = [
- ],
- ),
- copts = [],
- defines = [],
- includes = [
- "include",
- ],
- linkopts = [],
- visibility = ["//visibility:public"],
- deps = [
- ],
-)
diff --git a/bazel/zstd.BUILD b/bazel/zstd.BUILD
index 2efa3c6..bf77f9f 100644
--- a/bazel/zstd.BUILD
+++ b/bazel/zstd.BUILD
@@ -32,7 +32,7 @@ cmake(
"CMAKE_INSTALL_LIBDIR": "lib",
},
generate_args = ["-GNinja"],
- lib_source = "@com_github_facebook_zstd//:all",
+ lib_source = "@zstd//:all",
out_include_dir = "include/",
out_static_libs = ["libzstd.a"],
working_directory = "build/cmake",
diff --git a/benchmark/Makefile b/benchmark/Makefile
index 50569ce..7f80eaa 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -35,5 +35,5 @@ analysis:
docker logs $(DOCKER_PROJ_NAME)-psi-sender-1 > $(PWD)/docker-compose/logs/sender/psi.log
docker logs $(DOCKER_PROJ_NAME)-psi-receiver-1 > $(PWD)/docker-compose/logs/receiver/psi.log
-start-docker:
+start-docker:
@(cd $(PWD)/docker-compose && docker compose -p ${DOCKER_PROJ_NAME} up -d)
diff --git a/benchmark/docker-compose/setup_wan.sh b/benchmark/docker-compose/setup_wan.sh
index 3a34e27..cc4b1c0 100644
--- a/benchmark/docker-compose/setup_wan.sh
+++ b/benchmark/docker-compose/setup_wan.sh
@@ -3,3 +3,4 @@ set -eu
yum install iproute-tc -y;
tc qdisc add dev eth0 root handle 1: tbf rate 100mbit burst 128kb latency 10ms;
tc qdisc add dev eth0 parent 1:1 handle 10: netem delay 10msec limit 8000
+
diff --git a/benchmark/plot_csv_data.py b/benchmark/plot_csv_data.py
index 305a0b4..e33f2e7 100644
--- a/benchmark/plot_csv_data.py
+++ b/benchmark/plot_csv_data.py
@@ -21,7 +21,9 @@
def plot_cpu(docker_csv_path, output_path):
df1 = pd.read_csv(docker_csv_path)
- plt.plot(df1["running_time_s"], df1["cpu_percent"], marker="o", linestyle="-", color="b")
+ plt.plot(
+ df1["running_time_s"], df1["cpu_percent"], marker="o", linestyle="-", color="b"
+ )
max_time_count = 10
interval = 1
if len(df1) > max_time_count:
@@ -44,10 +46,13 @@ def plot_cpu(docker_csv_path, output_path):
plt.savefig(output_path)
plt.clf()
+
def plot_mem(docker_csv_path, output_path):
df1 = pd.read_csv(docker_csv_path)
- plt.plot(df1["running_time_s"], df1["mem_usage_MB"], marker="o", linestyle="-", color="b")
+ plt.plot(
+ df1["running_time_s"], df1["mem_usage_MB"], marker="o", linestyle="-", color="b"
+ )
max_time_count = 10
interval = 1
if len(df1) > max_time_count:
@@ -70,11 +75,16 @@ def plot_mem(docker_csv_path, output_path):
plt.savefig(output_path)
plt.clf()
+
def plot_net(docker_csv_path, output_path):
df1 = pd.read_csv(docker_csv_path)
- plt.plot(df1["running_time_s"], df1["net_tx_kb"], marker="o", linestyle="-", color="b")
- plt.plot(df1["running_time_s"], df1["net_rx_kb"], marker="*", linestyle="-", color="y")
+ plt.plot(
+ df1["running_time_s"], df1["net_tx_kb"], marker="o", linestyle="-", color="b"
+ )
+ plt.plot(
+ df1["running_time_s"], df1["net_rx_kb"], marker="*", linestyle="-", color="y"
+ )
max_time_count = 10
interval = 1
if len(df1) > max_time_count:
diff --git a/benchmark/stats.py b/benchmark/stats.py
index 84b1040..0780167 100644
--- a/benchmark/stats.py
+++ b/benchmark/stats.py
@@ -20,6 +20,7 @@
import time
from datetime import datetime
+
def stream_container_stats(container_name, output_file):
client = docker.from_env()
@@ -27,8 +28,16 @@ def stream_container_stats(container_name, output_file):
container = client.containers.get(container_name)
stats_stream = container.stats(stream=True)
- with open(output_file, 'w', newline='') as csvfile:
- fieldnames = ['cpu_percent', 'mem_usage_MB', 'mem_limit_MB', 'net_tx_kb', 'net_rx_kb', 'running_time_s', 'time']
+ with open(output_file, "w", newline="") as csvfile:
+ fieldnames = [
+ "cpu_percent",
+ "mem_usage_MB",
+ "mem_limit_MB",
+ "net_tx_kb",
+ "net_rx_kb",
+ "running_time_s",
+ "time",
+ ]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
@@ -40,36 +49,54 @@ def stream_container_stats(container_name, output_file):
for stats in stats_stream:
data = json.loads(stats)
running_time_s = int(time.time()) - start_unix_time
- cpu_percent = ((data['cpu_stats']['cpu_usage']['total_usage'] - prev_cpu_total) /
- (data['cpu_stats']['system_cpu_usage'] - prev_cpu_system)) * 100 * os.cpu_count()
- mem_usage = (data['memory_stats']['usage'] - data['memory_stats']['stats']['inactive_file']) / 1024 / 1024
- mem_limit = data['memory_stats']['limit'] / 1024 / 1024
+ cpu_percent = (
+ (
+ (data["cpu_stats"]["cpu_usage"]["total_usage"] - prev_cpu_total)
+ / (data["cpu_stats"]["system_cpu_usage"] - prev_cpu_system)
+ )
+ * 100
+ * os.cpu_count()
+ )
+ mem_usage = (
+ (
+ data["memory_stats"]["usage"]
+ - data["memory_stats"]["stats"]["inactive_file"]
+ )
+ / 1024
+ / 1024
+ )
+ mem_limit = data["memory_stats"]["limit"] / 1024 / 1024
net_tx = 0
net_rx = 0
- for key, value in data['networks'].items():
- net_tx += value['tx_bytes'] / 1024
- net_rx += value['rx_bytes'] / 1024
+ for key, value in data["networks"].items():
+ net_tx += value["tx_bytes"] / 1024
+ net_rx += value["rx_bytes"] / 1024
# skip first five seconds, due to running setting up network
if running_time_s > 5:
- writer.writerow({
- 'cpu_percent': cpu_percent,
- 'mem_usage_MB': int(mem_usage),
- 'mem_limit_MB': int(mem_limit),
- 'net_tx_kb': int((net_tx - prev_net_tx) * 8),
- 'net_rx_kb': int((net_rx - prev_net_rx) * 8),
- 'running_time_s': running_time_s,
- 'time': datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
- })
+ writer.writerow(
+ {
+ "cpu_percent": cpu_percent,
+ "mem_usage_MB": int(mem_usage),
+ "mem_limit_MB": int(mem_limit),
+ "net_tx_kb": int((net_tx - prev_net_tx) * 8),
+ "net_rx_kb": int((net_rx - prev_net_rx) * 8),
+ "running_time_s": running_time_s,
+ "time": datetime.fromtimestamp(time.time()).strftime(
+ "%H:%M:%S"
+ ),
+ }
+ )
prev_net_tx = net_tx
prev_net_rx = net_rx
- prev_cpu_total = data['cpu_stats']['cpu_usage']['total_usage']
- prev_cpu_system = data['cpu_stats']['system_cpu_usage']
+ prev_cpu_total = data["cpu_stats"]["cpu_usage"]["total_usage"]
+ prev_cpu_system = data["cpu_stats"]["system_cpu_usage"]
except docker.errors.NotFound:
print(f"Container {container_name} not found.")
except Exception as e:
- if container.status != 'exited':
+ if container.status != "exited":
print(f"An error occurred: {e} container.status: {container.status}")
+
if __name__ == "__main__":
- stream_container_stats(sys.argv[1], sys.argv[2])
\ No newline at end of file
+ stream_container_stats(sys.argv[1], sys.argv[2])
diff --git a/docs/development/psi_protocol_intro.rst b/docs/development/psi_protocol_intro.rst
index 72b637c..f5b8710 100644
--- a/docs/development/psi_protocol_intro.rst
+++ b/docs/development/psi_protocol_intro.rst
@@ -13,12 +13,12 @@ SecretFlow SPU implements the following PSI protocols,
ECDH-PSI
--------
-The semi-honest DH-PSI protocol is due to Huberman, Franklin, and Hogg [HFH99]_,
+The semi-honest DH-PSI protocol is due to Huberman, Franklin, and Hogg [HFH99]_,
but with roots as far back as Meadows [Mea86]_. It is a semi-honest protocol that
requires exponentiations in a Diffie-Hellman group proportional to the number of items in the sets.
-As a general rule, OT-based PSI protocols are (significantly) faster but require more communication
-than Diffie-Hellman-based PSI protocols.
+As a general rule, OT-based PSI protocols are (significantly) faster but require more communication
+than Diffie-Hellman-based PSI protocols.
In some scenarios, communication cost is overwhelmingly more important than computation cost.
DH-PSI protocol based on the Decisional Diffie-Hellman assumption:
@@ -32,22 +32,22 @@ Curve25519 [Ber06]_ offer a good balance between security and performance.
.. figure:: ../_static/dh_psi.png
-1. For each element :math:`x_i` in its set, Alice applies the hash function and then exponentiates it
- using its key :math:`\alpha`, thus computing :math:`{H(x_i)}^\alpha` . Alice sends
+1. For each element :math:`x_i` in its set, Alice applies the hash function and then exponentiates it
+ using its key :math:`\alpha`, thus computing :math:`{H(x_i)}^\alpha` . Alice sends
:math:`\{\{H(x_i)\}^\alpha\}_{i=1}^{n_1}` to Bob.
-2. For each element :math:`{H(x_i)}^\alpha` received from Alice in the previous step, Bob exponentiates
- it using its key :math:`\beta`, computing :math:`{H(x_i)}^{\alpha\beta}`.
+2. For each element :math:`{H(x_i)}^\alpha` received from Alice in the previous step, Bob exponentiates
+ it using its key :math:`\beta`, computing :math:`{H(x_i)}^{\alpha\beta}`.
Bob sends :math:`{\{\{H(x_i)\}^{\alpha\beta}\}}_{i=1}^{n_1}` to Alice.
-3. For each element :math:`y_i` in its set, Bob applies the hash function and then exponentiates it
- using its key :math:`\beta`, thus computing :math:`{H(y_i)}^\beta` .
+3. For each element :math:`y_i` in its set, Bob applies the hash function and then exponentiates it
+ using its key :math:`\beta`, thus computing :math:`{H(y_i)}^\beta` .
Bob sends the set :math:`\{\{H(y_i)\}^\beta\}_{i=1}^{n_2}` to Alice.
-4. For each element :math:`{H(y_i)}^\beta` received from Bob in the previous step, Alice exponentiates
- it using its key :math:`\alpha`, computing :math:`{H(y_i)}^{\beta\alpha}` .
+4. For each element :math:`{H(y_i)}^\beta` received from Bob in the previous step, Alice exponentiates
+ it using its key :math:`\alpha`, computing :math:`{H(y_i)}^{\beta\alpha}` .
-5. Alice compares two set :math:`{\{\{H(x_i)\}^{\alpha\beta}\}}_{i=1}^{n_1}`
+5. Alice compares two set :math:`{\{\{H(x_i)\}^{\alpha\beta}\}}_{i=1}^{n_1}`
and :math:`{\{\{H(y_i)\}^{\beta\alpha}\}}_{i=1}^{n_2}` and gets intersection.
The Elliptic Curve groups, supported in secretflow SPU PSI module.
@@ -74,27 +74,27 @@ ECDH-PSI (3P)
We implement our own three-party PSI protocol based on ECDH. Note that our implementation has known
leakage, please use at your own risk.
-Assume Alice, Bob, Charlie (receiver) want to perform 3P PSI, in addition to the final output, our
+Assume Alice, Bob, Charlie (receiver) want to perform 3P PSI, in addition to the final output, our
protocol leaks the intersection size of Alice's data and Bob's data to Charlie.
.. figure:: ../_static/dh_psi_3p.png
-Note that at the beginning of ECDH-PSI protocol, we assume the input data from both Alice and Charlie are
+Note that at the beginning of ECDH-PSI protocol, we assume the input data from both Alice and Charlie are
shuffled (It's not necessary to shuffle Bob's set).
Protocol:
1. For i-th element in its set, Alice calculates :math:`H(x_i)^\alpha` and sends to Bob.
-2. For i-th element, Bob calculates :math:`H(x_i)^{\alpha\beta}` and
+2. For i-th element, Bob calculates :math:`H(x_i)^{\alpha\beta}` and
:math:`H(y_i)^\beta`, then shuffles them randomly and sends them to Alice.
-3. For i-th element, Alice calculates :math:`H(y_i)^{\alpha\beta}` and gets the intersection of
- :math:`H(x_i)^{\alpha\beta} \cap H(y_i)^{\alpha\beta}` (we denote the intersection as
+3. For i-th element, Alice calculates :math:`H(y_i)^{\alpha\beta}` and gets the intersection of
+ :math:`H(x_i)^{\alpha\beta} \cap H(y_i)^{\alpha\beta}` (we denote the intersection as
:math:`I^{\alpha\beta}`), then sends :math:`I^{\alpha\beta}` to Charlie.
-4. For i-th element, Charlie sends :math:`H(z_i)^{\gamma}` to Bob, Bob calculates and sends to
- Alice :math:`H(z_i)^{\beta\gamma}`, finally Alice calculates and sends to
+4. For i-th element, Charlie sends :math:`H(z_i)^{\gamma}` to Bob, Bob calculates and sends to
+ Alice :math:`H(z_i)^{\beta\gamma}`, finally Alice calculates and sends to
Charlie :math:`H(z_i)^{\alpha\beta\gamma}`.
5. Charlie calculates :math:`I^{\alpha\beta\gamma}` and compares :math:`I^{\alpha\beta\gamma}` with
@@ -103,9 +103,9 @@ Protocol:
KKRT16-PSI
----------
-[KKRT16]_ is semi-honest OT-based PSI, based on OT Extension, BaRK-OPRF and CuckooHash.
-[KKRT16]_ is the first PSI protocol requiring only one minute for the case of larger sets
-( :math:`2^{24}` items each) of long strings (128 bits).
+[KKRT16]_ is semi-honest OT-based PSI, based on OT Extension, BaRK-OPRF and CuckooHash.
+[KKRT16]_ is the first PSI protocol requiring only one minute for the case of larger sets
+( :math:`2^{24}` items each) of long strings (128 bits).
We use 3-way stash-less CuckooHash proposed in [PSZ18]_.
@@ -127,20 +127,20 @@ Protocol. Our implementation bases on ECDH-PSI, and provides:
- Differentially private PSI results.
-This feature is currently under test, please use at your own risk!
+This feature is currently under test, please use at your own risk!
Why PSI with differentially private results? If we want a scheme that protects
both the private inputs and output privacy, an ideal way is to use `circuit
PSI`, which is a typical PSI variant that allows secure computation (e.g. MPC or
HE) on the PSI result without revealing it. `PSTY19
`_ However those protocols are expensive
-in terms of efficiency.
+in terms of efficiency.
DP-PSI is a way of utilizing the up-sampling and sub-sampling mechanism to add
-calibrated noises to the PSI results, without revealing its concise value.
+calibrated noise to the PSI results, without revealing its precise value. 
The protocol is listed below, assume Alice has a (hashed and shuffled) set
-:math:`X` and Bob has a (hashed and shuffled) :math:`Y`.
+:math:`X` and Bob has a (hashed and shuffled) :math:`Y`.
.. figure:: ../_static/dp_psi.png
@@ -151,33 +151,33 @@ Protocol:
1. Alice and Bob first encrypts their own dataset, and gets :math:`X^a` and
:math:`Y^b` separately.
-
+
2. Alice sends :math:`X^a` to Bob.
-
+
3. Bob performs random subsampling on :math:`Y^b`, gets :math:`Y_*^b` and sends it
to Alice. In the meantime, on receiving :math:`X^a` from Alice, Bob
re-encrypts it with :math:`b`, gets :math:`X^{ab}`. Then it samples a random
permutation :math:`\pi` to permute Alice's set, and sends permuted
:math:`\pi(X^{ab})` back to Alice.
-
+
4. On receiving :math:`Y_*^b` and :math:`\pi(X^{ab})` from Bob, Alice re-encrypts
:math:`Y_*^b` and gets :math:`Y_*^{ab}`, then calculates the intersection
:math:`I_*^{ab}\gets\pi(X^{ab})\cap Y_*^{ab}`.
-
+
5. Alice randomly subsamples the intersection, gets :math:`I_{**}^{ab}`, and
then finds their corresponding index in :math:`Y_*^b`. Then randomly adds
non-intersection index to this set.
-
+
6. Alice sends the index set to Bob, then Bob reveals the final results.
In the end, this scheme ensures that the receiver (Bob) only learns the noised
intersection, without the ability of pointing out whether an element is in the
-actual set intersection or not.
+actual set intersection or not.
Note that multiple invocations of DP-PSI inevitably weaken the privacy
protection, therefore, we strongly suggest that user should implement a
protection mechanism to prevent multiple DP-PSI executions on the same input
-value.
+value.
+---------------------------+--------+---------+---------+---------+-----------+
| Intel(R) Xeon(R) Platinum | 2^20 | 2^21 | 2^22 | 2^23 | 2^24 |
@@ -194,31 +194,31 @@ Unbalanced PSI
Ecdh-OPRF based PSI
>>>>>>>>>>>>>>>>>>>
-[RA18]_ section 3 introduces Basic Unbalanced PSI(Ecdh-OPRF based) protocol proposed in [BBCD+11]_ that relaxes
+[RA18]_ section 3 introduces Basic Unbalanced PSI(Ecdh-OPRF based) protocol proposed in [BBCD+11]_ that relaxes
the security of the [JL10]_ to be secure against semi-honest adversaries. The protocol has two phases, the preprocessing phase and the online phase. The
authors introduced many optimizations to push as much computation and communication cost to
the preprocessing phase as possible.
-An Oblivious Pseudorandom Function (OPRF) is a two-party protocol between client and server for computing the
-output of a Pseudorandom Function (PRF). [draft-irtf-cfrg-voprf-10]_ specifies OPRF, VOPRF, and POPRF protocols
+An Oblivious Pseudorandom Function (OPRF) is a two-party protocol between client and server for computing the
+output of a Pseudorandom Function (PRF). [draft-irtf-cfrg-voprf-10]_ specifies OPRF, VOPRF, and POPRF protocols
built upon prime-order groups.
.. figure:: ../_static/ecdh_oprf_psi.jpg
- Offline Phase
-
- 1. For each element :math:`y_i` in its set, Bob applies PRF using
- private key :math:`\beta`, i.e. computing :math:`H_2(y_i,{H_1(y_i)}^\beta)` .
-
+
+ 1. For each element :math:`y_i` in its set, Bob applies PRF using
+ private key :math:`\beta`, i.e. computing :math:`H_2(y_i,{H_1(y_i)}^\beta)` .
+
2. Bob sends :math:`\{\{H_2(y_i,{H_1(y_i)}^\beta)\}\}_{i=1}^{n_2}` to Alice in shuffled order.
-
+
- Online Phase
-
- 1. For each element :math:`x_i` in its set, Alice applies the hash function and then exponentiates
- it using its blind key :math:`r_i`, thus computing :math:`{H_1(x_i)}^{r_i}`. Alice sends
+
+ 1. For each element :math:`x_i` in its set, Alice applies the hash function and then exponentiates
+ it using its blind key :math:`r_i`, thus computing :math:`{H_1(x_i)}^{r_i}`. Alice sends
:math:`\{\{H_1(x_i)\}^{r_i}\}_{i=1}^{n_1}` to Bob.
- 2. For each element :math:`H_1(x_i)^{r_i}` received from Alice in the previous step, Bob exponentiates
- it using its key :math:`\beta`, computing :math:`{H_1(x_i)}^{r_i\beta}`.
+ 2. For each element :math:`H_1(x_i)^{r_i}` received from Alice in the previous step, Bob exponentiates
+ it using its key :math:`\beta`, computing :math:`{H_1(x_i)}^{r_i\beta}`.
Bob sends :math:`{\{\{H_1(x_i)\}^{\{r_i\}\beta}\}}_{i=1}^{n_1}` to Alice.
3. Alice receives :math:`{\{\{H_1(x_i)\}^{r_i\beta}\}}_{i=1}^{n_1}` from Bob, and unblinds it using :math:`r_i`,
gets :math:`\{\{\{H_1(x_i)\}^\beta\}\}_{i=1}^{n_1}`, computes OPRF :math:`\{\{H_2(x_i,{H_1(x_i)}^\beta)\}\}_{i=1}^{n_1}`.
@@ -228,17 +228,17 @@ built upon prime-order groups.
Labeled PSI
>>>>>>>>>>>
-Somewhat homomorphic encryption (SHE) can be used to build efficient (labeled) Private Set Intersection
-protocols in the unbalanced setting, where one of the sets is much larger than the other.
-[CMGD+21]_ introduces several optimizations and improvements to the protocols of
-[CLR17]_, [CHLR18]_, resulting in improved running time and improved communication complexity in the
+Somewhat homomorphic encryption (SHE) can be used to build efficient (labeled) Private Set Intersection
+protocols in the unbalanced setting, where one of the sets is much larger than the other.
+[CMGD+21]_ introduces several optimizations and improvements to the protocols of
+[CLR17]_, [CHLR18]_, resulting in improved running time and improved communication complexity in the
sender's set size.
-Microsoft `APSI (Asymmetric PSI) `_ library provides a PSI functionality
-for asymmetric set sizes based on the latest [CMGD+21]_. APSI uses the BFV([FV12]_) encryption scheme implemented
+Microsoft `APSI (Asymmetric PSI) `_ library provides a PSI functionality
+for asymmetric set sizes based on the latest [CMGD+21]_. APSI uses the BFV([FV12]_) encryption scheme implemented
in the Microsoft [SEAL]_ library.
-SecretFlow SPU wraps `APSI `_ library, can be used for
+SecretFlow SPU wraps `APSI `_ library, can be used for
- Unbalanced PSI
- Malicious PSI
@@ -248,26 +248,26 @@ SecretFlow SPU wraps `APSI `_ library, can be
.. figure:: ../_static/labeled_psi.png
- Setup Phase
-
+
1. **Choose ItemParams**, TableParams, QueryParams, SEALParams.
- 2. **Sender's OPRF**: The sender samples a key :math:`\beta` for the OPRF, updates its items set
+ 2. **Sender's OPRF**: The sender samples a key :math:`\beta` for the OPRF, updates its items set
to :math:`\{\{H_2(s_i,{H_1(s_i)}^\beta)\}\}_{s_i\in S}`.
3. **Sender's Hashing**: Sender inserts all :math:`s_i\in S` into the sets :math:`\mathcal{B}[h_0(s_i)]`,
:math:`\mathcal{B}[h_1(s_i)]` and :math:`\mathcal{B}[h_2(s_i)]`.
4. **Splitting**: For each set :math:`\mathcal{B}[i]`, the sender splits it into bin bundles, denoted as
:math:`\mathcal{B}[i,1]`, ..., :math:`\mathcal{B}[i,k]`.
- 5. **Computing Coeffcients**:
-
- - **Matching Polynomial**: For each bin bundle :math:`\mathcal{B}[i,j]`, the sender computes the
+ 5. **Computing Coefficients**: 
+
+ - **Matching Polynomial**: For each bin bundle :math:`\mathcal{B}[i,j]`, the sender computes the
matching polynomial over :math:`\mathbb{F}_t`.
- - **Label Polynomial**: If the sender has labels associated with its set, then for each bin bundle
+ - **Label Polynomial**: If the sender has labels associated with its set, then for each bin bundle
:math:`\mathcal{B}[i,j]`, the sender interpolates the label polynomial over :math:`\mathbb{F}_t`.
-
+
- Intersection Phase
-
+
1. Receiver Encrypt :math:`r_i \in R`.
- - **Receiver's OPRF**: Receiver and Sender run ecdh-OPRF protocol, get
+ - **Receiver's OPRF**: Receiver and Sender run ecdh-OPRF protocol, get
:math:`\{\{H_2(r_i,{H_1(r_i)}^\beta)\}\}_{r_i\in R}`.
- **Receiver's CuckooHash**: Receiver performs cuckoo hashing on the set :math:`R` into CuckooTable C with m bins
using h1; h2; h3 has the hash functions.
@@ -276,9 +276,9 @@ SecretFlow SPU wraps `APSI `_ library, can be
- **Encrypt**: The receiver uses *FHE.Encrypt* to encrypt query powers and sends the ciphertexts to the sender.
2. **Sender Homomorphically evaluate Matching Polynomial**: The sender receives the collection of
- ciphertexts and homomorphically evaluates Matching Polynomial. If Labeled PSI is desired, Sender homomorphically evaluates
+ ciphertexts and homomorphically evaluates Matching Polynomial. If Labeled PSI is desired, Sender homomorphically evaluates
Label Polynomial. The sender sends evaluated ciphertexts to Receiver.
- 3. **Receiver Decrypt and Get result**: receiver receives and decrypts the matching ciphertexts, and labels
+ 3. **Receiver Decrypt and Get result**: receiver receives and decrypts the matching ciphertexts, and labels
ciphertexts if needed, outputs the matching set and labels.
Labeled PSI Parameters
@@ -325,15 +325,15 @@ RR22 Blazing Fast PSI
[RS21]_ introduced an efficient PSI protocol based on OKVS and VOLE. [RR22]_ present significant improvements
to the OKVS data structure along with new techniquesfor further reducing the communication overhead of [RS]21.
-Oblivous Key-Value Stores(OKVS) consists of algorithms Encode and Decode. Encode takes a list of key-value (k,v)
-pairs as input and returns an abstract data structure S. Decode takes such a data structure S and a key k' as
+Oblivious Key-Value Stores (OKVS) consists of algorithms Encode and Decode. Encode takes a list of key-value (k,v) 
+pairs as input and returns an abstract data structure S. Decode takes such a data structure S and a key k' as
input, and gives some output v'.
Pseudorandom correlation generators(PCGs) allow for the efficient generation of
oblivious transfer (OT) and vector oblivious linear evaluations (VOLE)
with sublinear communication and concretely good computational overhead.
-PCG makes use of a so-called LPN-friendly errorcorrecting code.
-`secretflow/YACL `_ provides VOLE code implementation.
+PCG makes use of a so-called LPN-friendly error-correcting code. 
+`secretflow/YACL `_ provides VOLE code implementation.
LPN-friendly coeds now support [CRR21]_ silver codes(LDPC) and [BCGI+22]_ Expand-Accumulate Codes.
Silver is Most efficient, but not recommended to use due to its security flaw.
@@ -341,7 +341,7 @@ Semi-honest Protocol:
.. figure:: ../_static/rr22_psi.png
-1. The Receiver samples :math:`r \leftarrow \{0,1\}^\kappa` and computes
+1. The Receiver samples :math:`r \leftarrow \{0,1\}^\kappa` and computes
:math:`\vec{P} := \mathrm{Encode} (L,r)` where
:math:`L := \{(H^{n*m}(x,r),H(x))|x \in X\}`.
@@ -349,11 +349,11 @@ Semi-honest Protocol:
:math:`\vec{B}`, Receiver gets :math:`\vec{A}` and :math:`\vec{C}`, such that:
:math:`\vec{C}=\Delta *\vec{A'}+\vec{B}`.
-3. Receiver sends :math:`r, \vec{A}=\vec{A'}+\vec{P}` to Sender. Sender defines
+3. Receiver sends :math:`r, \vec{A}=\vec{A'}+\vec{P}` to Sender. Sender defines
:math:`\vec{K}=\vec{B}+\Delta \cdot \vec{A}`.
-4. Sender sends :math:`Y'=H^{n*m}(\vec{Y},r)\cdot \vec{K}-\Delta \cdot H(\vec{Y})`
- to the Receiver.
+4. Sender sends :math:`Y'=H^{n*m}(\vec{Y},r)\cdot \vec{K}-\Delta \cdot H(\vec{Y})`
+ to the Receiver.
5. Receiver compares :math:`X'=H^{n*m}(\vec{X},r)\cdot \vec{C}` and :math:`Y'`, outputs
intersection result :math:`X \cap Y`.
@@ -368,14 +368,14 @@ Reference
OT extension and silent non-interactive secure computation. In ACM CCS 2019, pages 291–308.
ACM Press, November 2019.
-.. [BCG+19b] E. Boyle, G. Couteau, N. Gilboa, Y. Ishai, L. Kohl, P. Rindal, and P. Scholl.
+.. [BCG+19b] E. Boyle, G. Couteau, N. Gilboa, Y. Ishai, L. Kohl, P. Rindal, and P. Scholl.
Efficient two-round OT extension and silent non-interactive secure computation. In ACM CCS 2019,
pages 291–308. ACM Press, November 2019.
.. [Ber06] Daniel J. Bernstein. Curve25519: new diffie-hellman speed records. In In Public
Key Cryptography (PKC), Springer-Verlag LNCS 3958, page 2006, 2006. (Cited on page 4.)
-.. [BCGI+22] Elette Boyle, Geoffroy Couteau, Niv Gilboa, Yuval Ishai, Lisa Kohl, Nicolas Resch, Peter Scholl.
+.. [BCGI+22] Elette Boyle, Geoffroy Couteau, Niv Gilboa, Yuval Ishai, Lisa Kohl, Nicolas Resch, Peter Scholl.
Correlated Pseudorandomness from Expand-Accumulate Codes. Crypto2022.
.. [BBCD+11] Baldi, P., Baronio, R., Cristofaro, E.D., Gasti, P., Tsudik, G.: Countering GATTACA:
@@ -393,25 +393,25 @@ Reference
B.M., Evans, D., Malkin, T., Xu, D. (eds.) ACM CCS 2017. pp. 1243{1255. ACM Press (Oct / Nov 2017).
https://doi.org/10.1145/3133956.3134061
-.. [CMGD+21] Kelong Cong, Radames Cruz Moreno, Mariana Botelho da Gama, Wei Dai, Ilia Iliashenko, Kim Laine,
- Michael Rosenberg. Labeled PSI from Homomorphic Encryption with Reduced Computation and Communication
- CCS'21: Proceedings of the 2021 ACM SIGSAC Conference on Computer and Communications SecurityNovember 2021
+.. [CMGD+21] Kelong Cong, Radames Cruz Moreno, Mariana Botelho da Gama, Wei Dai, Ilia Iliashenko, Kim Laine,
+ Michael Rosenberg. Labeled PSI from Homomorphic Encryption with Reduced Computation and Communication
+ CCS'21: Proceedings of the 2021 ACM SIGSAC Conference on Computer and Communications SecurityNovember 2021
-.. [CRR21] Geoffroy Couteau, Peter Rindal, and Srinivasan Raghuraman. Silver: Silent VOLE and Oblivious Transfer
+.. [CRR21] Geoffroy Couteau, Peter Rindal, and Srinivasan Raghuraman. Silver: Silent VOLE and Oblivious Transfer
from Hardness of Decoding Structured LDPC Codes. Crypto2021.
.. [DP-PSI] Differentially-Private PSI https://arxiv.org/pdf/2208.13249.pdf
-.. [FourQ] Costello, C., Longa, P.: Fourq: four-dimensional decompositions on a q-curve over the mersenne prime.
+.. [FourQ] Costello, C., Longa, P.: Fourq: four-dimensional decompositions on a q-curve over the mersenne prime.
Cryptology ePrint Archive, Report 2015/565 (2015), https://eprint.iacr.org/2015/565
-.. [FV12] Fan, J., Vercauteren, F.: Somewhat practical fully homomorphic encryption. Cryptology ePrint Archive,
+.. [FV12] Fan, J., Vercauteren, F.: Somewhat practical fully homomorphic encryption. Cryptology ePrint Archive,
Report 2012/144 (2012), http://eprint.iacr.org/2012/144.pdf
.. [HFH99] Bernardo A. Huberman, Matt Franklin, and Tad Hogg. Enhancing privacy and trust in electronic
communities. In ACM CONFERENCE ON ELECTRONIC COMMERCE. ACM, 1999.
-.. [ipp-crypto] https://github.com/intel/ipp-crypto/
+.. [ipp-crypto] https://github.com/intel/ipp-crypto/
.. [JL10] Jarecki, S., Liu, X.: Fast Secure Computation of Set Intersection. In: SCN. LNCS,
vol. 6280, pp. 418–435. Springer (2010)
@@ -425,17 +425,17 @@ Reference
.. [PSZ18] B. Pinkas, T. Schneider, and M. Zohner. Scalable private set intersection based on ot extension.
ACM Transactions on Privacy and Security (TOPS), 21(2):1–35, 2018.
-.. [RA18] Resende, A.C.D., Aranha, D.F.: Faster unbalanced private set intersection. In: Meiklejohn, S.,
- Sako, K. (eds.) FC2018. LNCS, vol. 10957, pp. 203{221. Springer, Heidelberg (Feb / Mar 2018)
+.. [RA18] Resende, A.C.D., Aranha, D.F.: Faster unbalanced private set intersection. In: Meiklejohn, S.,
+ Sako, K. (eds.) FC2018. LNCS, vol. 10957, pp. 203{221. Springer, Heidelberg (Feb / Mar 2018)
.. [RR22] Srinivasan Raghuraman and Peter Rindal. Blazing Fast PSI from Improved OKVS and Subfield VOLE. CCS'22.
-.. [RRT23] Srinivasan Raghuraman, Peter Rindal, Titouan Tanguy. Expand-Convolute Codes for Pseudorandom
+.. [RRT23] Srinivasan Raghuraman, Peter Rindal, Titouan Tanguy. Expand-Convolute Codes for Pseudorandom
Correlation Generators from LPN. Crypto2023.
.. [RS21] Peter Rindal and Phillipp Schoppmann. VOLE-PSI: fast OPRF and circuit-psi from vector-ole. EUROCRYPT2021.
-.. [SEAL] Microsoft SEAL (release 4.0). https://github.com/Microsoft/SEAL (Sep 2022),
+.. [SEAL] Microsoft SEAL (release 4.0). https://github.com/Microsoft/SEAL (Sep 2022),
microsoft Research, Redmond, WA.
.. [SEC2-v2] Standards for Efficient Cryptography (SEC)
@@ -447,5 +447,5 @@ Reference
zero-knowledge proofs for boolean and arithmetic circuits. In 2021 IEEE Symposium on Security
and Privacy (SP), pages 1074–1091. IEEE, 2021.
-.. [draft-irtf-cfrg-voprf-10] Oblivious Pseudorandom Functions (OPRFs) using Prime-Order Groups.
- https://www.ietf.org/archive/id/draft-irtf-cfrg-voprf-10.html
+.. [draft-irtf-cfrg-voprf-10] Oblivious Pseudorandom Functions (OPRFs) using Prime-Order Groups.
+ https://www.ietf.org/archive/id/draft-irtf-cfrg-voprf-10.html
diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index 997f512..d59331a 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -20,7 +20,7 @@ Welcome to SecretFlow PSI Library. There are multiple methods to use PSI/PIR.
For PSI, we have a developing v2 PSI.
+------------------------+------------------------------------------------+---------------------------------------------+
-| | PSI v1 APIs | PSI v2 APIs |
+| | PSI v1 APIs | PSI v2 APIs |
+========================+================================================+=============================================+
| Supported Protocols | ECDH, KKRT, ECDH_OPRF_UB, DP_PSI, RR22 | ECDH, KKRT, RR22, ECDH_OPRF_UB |
+------------------------+------------------------------------------------+---------------------------------------------+
@@ -133,7 +133,7 @@ We use the same dev docker from secretflow/ubuntu-base-ci::
--entrypoint="bash" \
secretflow/ubuntu-base-ci:latest
-
+
# attach to build container
docker exec -it psi-dev-$(whoami) bash
@@ -152,7 +152,7 @@ You need to install:
* xxd
* lld
-For bazel, please check version in `.bazelversion `_ or use bazelisk instead.
+For bazel, please check version in `.bazeliskrc `_ or use bazelisk instead.
Build & UnitTest
^^^^^^^^^^^^^^^^
diff --git a/docs/reference/launch_config.md b/docs/reference/launch_config.md
index 565ae1b..5563fa5 100644
--- a/docs/reference/launch_config.md
+++ b/docs/reference/launch_config.md
@@ -9,7 +9,7 @@ Please check psi.v2.PsiConfig and psi.v2.UbPsiConfig at **PSI v2 Configuration**
- Messages
- [LaunchConfig](#launchconfig)
-
+
@@ -20,7 +20,7 @@ Please check psi.v2.PsiConfig and psi.v2.UbPsiConfig at **PSI v2 Configuration**
- [PartyProto](#partyproto)
- [RetryOptionsProto](#retryoptionsproto)
- [SSLOptionsProto](#ssloptionsproto)
-
+
@@ -163,3 +163,4 @@ SSL options.
|
bool | | bool | boolean | boolean |
| string | A string must always contain UTF-8 encoded or 7-bit ASCII text. | string | String | str/unicode |
| bytes | May contain any arbitrary sequence of bytes. | string | ByteString | str |
+
diff --git a/docs/reference/pir_config.md b/docs/reference/pir_config.md
index 68cf9fe..5dc2498 100644
--- a/docs/reference/pir_config.md
+++ b/docs/reference/pir_config.md
@@ -8,7 +8,7 @@
- [ApsiReceiverConfig](#apsireceiverconfig)
- [ApsiSenderConfig](#apsisenderconfig)
- [PirResultReport](#pirresultreport)
-
+
@@ -100,3 +100,4 @@ The report of pir task.
| bool | | bool | boolean | boolean |
| string | A string must always contain UTF-8 encoded or 7-bit ASCII text. | string | String | str/unicode |
| bytes | May contain any arbitrary sequence of bytes. | string | ByteString | str |
+
diff --git a/docs/reference/psi_config.md b/docs/reference/psi_config.md
index 8c6f7d1..dd7b8c4 100644
--- a/docs/reference/psi_config.md
+++ b/docs/reference/psi_config.md
@@ -11,13 +11,13 @@
- [MemoryPsiConfig](#memorypsiconfig)
- [OutputParams](#outputparams)
- [PsiResultReport](#psiresultreport)
-
+
- Enums
- [CurveType](#curvetype)
- [PsiType](#psitype)
-
+
- [Scalar Value Types](#scalar-value-types)
@@ -198,3 +198,4 @@ The algorithm type of psi.
| bool | | bool | boolean | boolean |
| string | A string must always contain UTF-8 encoded or 7-bit ASCII text. | string | String | str/unicode |
| bytes | May contain any arbitrary sequence of bytes. | string | ByteString | str |
+
diff --git a/docs/reference/psi_v2_config.md b/docs/reference/psi_v2_config.md
index ff21ffe..4fcf189 100644
--- a/docs/reference/psi_v2_config.md
+++ b/docs/reference/psi_v2_config.md
@@ -16,7 +16,7 @@
- [RecoveryConfig](#recoveryconfig)
- [Rr22Config](#rr22config)
- [UbPsiConfig](#ubpsiconfig)
-
+
- Enums
@@ -26,7 +26,7 @@
- [RecoveryCheckpoint.Stage](#recoverycheckpointstage)
- [Role](#role)
- [UbPsiConfig.Mode](#ubpsiconfigmode)
-
+
- [Scalar Value Types](#scalar-value-types)
@@ -466,3 +466,4 @@ Role of parties.
| bool | | bool | boolean | boolean |
| string | A string must always contain UTF-8 encoded or 7-bit ASCII text. | string | String | str/unicode |
| bytes | May contain any arbitrary sequence of bytes. | string | ByteString | str |
+
diff --git a/docs/user_guide/apsi_benchmark.md b/docs/user_guide/apsi_benchmark.md
index b62f0c7..e7ebca0 100644
--- a/docs/user_guide/apsi_benchmark.md
+++ b/docs/user_guide/apsi_benchmark.md
@@ -19,9 +19,9 @@ To measure the performance of APSI protocols under different data scales, we nee
```python
-# one million key-value pairs, each value's length is 32-byte,
+# one million key-value pairs, each value's length is 32-byte,
python examples/pir/apsi/test_data_creator.py --sender_size=1000000 --receiver_size=1 --intersection_size=1 --label_byte_count=32
-# 16 million key-value pairs, each value's length is 32-byte,
+# 16 million key-value pairs, each value's length is 32-byte,
python examples/pir/apsi/test_data_creator.py --sender_size=16000000 --receiver_size=1 --intersection_size=1 --label_byte_count=32
```
@@ -134,10 +134,10 @@ docker start apsi_sender
docker exec -it apsi_sender bash
```
-Then run:
+Then run:
```bash
-# offline
+# offline
./main --config $(pwd)/examples/pir/config/apsi_sender_setup.json
# online
./main --config $(pwd)/examples/pir/config/apsi_sender_online.json
@@ -216,3 +216,5 @@ If you wish to measure the APSI performance for a specific data scale and label
Note that the above data does not represent the optimal performance of APSI. Under fixed data scale conditions, the query performance of APSI is highly correlated with the corresponding parameters. Additionally, if you want to support larger datasets, such as one billion data entries, we also offer a bucket mode. However, this mode requires consideration of more parameters, so it is not displayed in this benchmark.
+
+
diff --git a/docs/user_guide/faq.md b/docs/user_guide/faq.md
index 0d479c4..bfaa6e9 100644
--- a/docs/user_guide/faq.md
+++ b/docs/user_guide/faq.md
@@ -88,3 +88,4 @@ If a PSI task fails, just restart the task with the same config, the progress wi
3. What is **Easy PSI**? Why and when to use **Easy PSI**?
[Easy PSI](https://www.secretflow.org.cn/zh-CN/docs/easy-psi) is a standalone PSI product powered by this library. It provides a simple User Interface and utilize [Kuscia](https://www.secretflow.org.cn/docs/kuscia) to launch PSI binaries between both parties.
+
diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst
index ceabfc7..af79ae4 100644
--- a/docs/user_guide/index.rst
+++ b/docs/user_guide/index.rst
@@ -12,3 +12,4 @@ PSI v2 is recommended to use. We are still working on PIR code refactoring.
faq
psi_v2_benchmark
apsi_benchmark
+
diff --git a/docs/user_guide/pir.rst b/docs/user_guide/pir.rst
index 695bc71..d67ee86 100644
--- a/docs/user_guide/pir.rst
+++ b/docs/user_guide/pir.rst
@@ -47,9 +47,9 @@ If you want to try a similar CLI like APSI, you could compile the source code by
.. code-block::
- bazel build psi/apsi_wrapper/cli:receiver
+ bazel build psi/wrapper/apsi/cli:receiver
- bazel build psi/apsi_wrapper/cli:sender
+ bazel build psi/wrapper/apsi/cli:sender
And run CLI like
@@ -57,9 +57,9 @@ And run CLI like
.. code-block::
- ./bazel-bin/psi/apsi_wrapper/cli/sender
+ ./bazel-bin/psi/wrapper/apsi/cli/sender
- ./bazel-bin/psi/apsi_wrapper/cli/receiver
+ ./bazel-bin/psi/wrapper/apsi/cli/receiver
Prepare data and config
@@ -113,7 +113,7 @@ PIR Config
""""""""""
1. Sender: Setup Stage. In this stage, sender generates sender db file with csv file. This stage is offline.
-Since version **0.4.0b0**, the source csv file for db generating should be specified as **source_file**, and **db_file**
+Since version **0.4.0b0**, the source csv file for db generating should be specified as **source_file**, and **db_file**
is used to specify db file.
.. code-block::
@@ -145,7 +145,7 @@ is used to specify db file.
{
"id": "receiver",
"host": "127.0.0.1:5400"
-
+
.. code-block::
:caption: apsi_sender_setup.json
diff --git a/docs/user_guide/psi.rst b/docs/user_guide/psi.rst
index c41cd3f..af9b6ce 100644
--- a/docs/user_guide/psi.rst
+++ b/docs/user_guide/psi.rst
@@ -7,9 +7,9 @@ Quick start with Private Set Intersection (PSI) V1 APIs.
Supported Protocols
----------------------
-The :psi_code_host:`ECDH-PSI ` is favorable if the bandwidth is the bottleneck.
+The :psi_code_host:`ECDH-PSI ` is favorable if the bandwidth is the bottleneck.
If the computing is the bottleneck, you should try the BaRK-OPRF based
-PSI :psi_code_host:`KKRT-PSI `.
+PSI :psi_code_host:`KKRT-PSI `.
+---------------+--------------+--------------+--------------+
| PSI protocols | Threat Model | Party Number | PsiTypeCode |
@@ -27,11 +27,11 @@ PSI :psi_code_host:`KKRT-PSI `.
| `DP-PSI`_ | Semi-Honest | 2P | - |
+---------------+--------------+--------------+--------------+
-MPC and PSI protocols are designed for specific Security model (or Threat Models).
+MPC and PSI protocols are designed for specific Security model (or Threat Models).
-Security model are widely considered to capture the capabilities of adversaries.
+Security model are widely considered to capture the capabilities of adversaries.
Adversaries of semi-honest model and malicious model are Semi-honest Adversary and
-Malicious Adversary.
+Malicious Adversary.
- `Semi-honest Adversary `_
- `Malicious Adversary `_
@@ -92,12 +92,12 @@ Then use binary with::
Benchmark
----------
-benchmark result without data load time
+benchmark result without data load time
ECDH PSI Benchmark
>>>>>>>>>>>>>>>>>>
-:psi_code_host:`DH-PSI benchmark code `
+:psi_code_host:`DH-PSI benchmark code `
cpu limited by docker(--cpu)
@@ -127,7 +127,7 @@ cpu limited by docker(--cpu)
KKRT PSI Benchmark
>>>>>>>>>>>>>>>>>>>
-All of our experiments use a single thread for each party.
+All of our experiments use a single thread for each party.
If the bandwidth is enough, the upstream could try to perform multi-threading optimizations
@@ -161,6 +161,6 @@ Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz
Security Tips
-------------
-Warning: `KKRT16 `_ is semi-honest PSI protocols,
+Warning: `KKRT16 `_ is semi-honest PSI protocols,
and may be attacked in malicious model.
We recommend using KKRT16 PSI protocol as one-way PSI, i.e., one party gets the final intersection result.
diff --git a/docs/user_guide/psi_v2.rst b/docs/user_guide/psi_v2.rst
index fd0987e..3b75ddd 100644
--- a/docs/user_guide/psi_v2.rst
+++ b/docs/user_guide/psi_v2.rst
@@ -117,7 +117,7 @@ To launch PSI, please check LaunchConfig at :doc:`/reference/launch_config` and
You need to prepare following files:
+------------------------+------------------------------------------------+-------------------------------------------------------------------------------+
-| File Name | Location | Description |
+| File Name | Location | Description |
+========================+================================================+===============================================================================+
| receiver.config | /tmp/receiver/receiver.config | Config for receiver. |
+------------------------+------------------------------------------------+-------------------------------------------------------------------------------+
diff --git a/psi/legacy/kmprt17_mp_psi/BUILD.bazel b/experiment/psi/kmprt17_mp_psi/BUILD.bazel
similarity index 97%
rename from psi/legacy/kmprt17_mp_psi/BUILD.bazel
rename to experiment/psi/kmprt17_mp_psi/BUILD.bazel
index c08ce10..be525bf 100644
--- a/psi/legacy/kmprt17_mp_psi/BUILD.bazel
+++ b/experiment/psi/kmprt17_mp_psi/BUILD.bazel
@@ -32,7 +32,7 @@ psi_cc_library(
"//psi/utils:communication",
"//psi/utils:sync",
"//psi/utils:test_utils",
- "@com_google_absl//absl/types:span",
+ "@abseil-cpp//absl/types:span",
"@yacl//yacl/base:exception",
"@yacl//yacl/base:int128",
"@yacl//yacl/crypto/hash:hash_utils",
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_hashing.cc b/experiment/psi/kmprt17_mp_psi/kmprt17_hashing.cc
similarity index 97%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_hashing.cc
rename to experiment/psi/kmprt17_mp_psi/kmprt17_hashing.cc
index ebd537d..b9c49aa 100644
--- a/psi/legacy/kmprt17_mp_psi/kmprt17_hashing.cc
+++ b/experiment/psi/kmprt17_mp_psi/kmprt17_hashing.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/legacy/kmprt17_mp_psi/kmprt17_hashing.h"
+#include "experiment/psi/kmprt17_mp_psi/kmprt17_hashing.h"
#include
#include
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_hashing.h b/experiment/psi/kmprt17_mp_psi/kmprt17_hashing.h
similarity index 100%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_hashing.h
rename to experiment/psi/kmprt17_mp_psi/kmprt17_hashing.h
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.cc b/experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.cc
similarity index 97%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.cc
rename to experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.cc
index 8048881..d8a38ec 100644
--- a/psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.cc
+++ b/experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.cc
@@ -12,15 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.h"
+#include "experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.h"
#include
+#include "experiment/psi/kmprt17_mp_psi/kmprt17_opprf.h"
#include "yacl/crypto/hash/hash_utils.h"
#include "yacl/crypto/rand/rand.h"
#include "yacl/utils/serialize.h"
-#include "psi/legacy/kmprt17_mp_psi/kmprt17_opprf.h"
#include "psi/utils/communication.h"
#include "psi/utils/sync.h"
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.h b/experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.h
similarity index 100%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.h
rename to experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.h
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi_test.cc b/experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi_test.cc
similarity index 98%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi_test.cc
rename to experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi_test.cc
index c8c865d..3bae534 100644
--- a/psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi_test.cc
+++ b/experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.h"
+#include "experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.h"
#include
#include
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_opprf.cc b/experiment/psi/kmprt17_mp_psi/kmprt17_opprf.cc
similarity index 98%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_opprf.cc
rename to experiment/psi/kmprt17_mp_psi/kmprt17_opprf.cc
index f5c0ae5..28257fa 100644
--- a/psi/legacy/kmprt17_mp_psi/kmprt17_opprf.cc
+++ b/experiment/psi/kmprt17_mp_psi/kmprt17_opprf.cc
@@ -12,12 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/legacy/kmprt17_mp_psi/kmprt17_opprf.h"
+#include "experiment/psi/kmprt17_mp_psi/kmprt17_opprf.h"
#include
#include
#include
+#include "experiment/psi/kmprt17_mp_psi/kmprt17_hashing.h"
#include "yacl/crypto/rand/rand.h"
#include "yacl/crypto/tools/ro.h"
#include "yacl/kernel/algorithms/base_ot.h"
@@ -26,8 +27,6 @@
#include "yacl/link/link.h"
#include "yacl/utils/serialize.h"
-#include "psi/legacy/kmprt17_mp_psi/kmprt17_hashing.h"
-
namespace psi::psi {
namespace yc = yacl::crypto;
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_opprf.h b/experiment/psi/kmprt17_mp_psi/kmprt17_opprf.h
similarity index 100%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_opprf.h
rename to experiment/psi/kmprt17_mp_psi/kmprt17_opprf.h
diff --git a/psi/BUILD.bazel b/psi/BUILD.bazel
index c2914bc..8699466 100644
--- a/psi/BUILD.bazel
+++ b/psi/BUILD.bazel
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-load("//bazel:psi.bzl", "psi_cc_binary", "psi_cc_library", "psi_cc_test")
+load("//bazel:psi.bzl", "psi_cc_library")
package(default_visibility = ["//visibility:public"])
@@ -41,59 +41,18 @@ psi_cc_library(
"//psi/utils:recovery",
"//psi/utils:resource_manager",
"//psi/utils:table_utils",
- "@com_github_google_perfetto//:perfetto",
- "@com_google_absl//absl/status",
+ "@abseil-cpp//absl/status",
+ "@perfetto",
"@yacl//yacl/link",
],
)
-psi_cc_library(
- name = "factory",
- srcs = ["factory.cc"],
- hdrs = ["factory.h"],
- deps = [
- "//psi/ecdh:receiver",
- "//psi/ecdh:sender",
- "//psi/ecdh/ub_psi:client",
- "//psi/ecdh/ub_psi:server",
- "//psi/kkrt:receiver",
- "//psi/kkrt:sender",
- "//psi/rr22:receiver",
- "//psi/rr22:sender",
- "@yacl//yacl/base:exception",
- ],
-)
-
-psi_cc_library(
- name = "launch",
- srcs = ["launch.cc"],
- hdrs = ["launch.h"],
- deps = [
- ":factory",
- ":trace_categories",
- "//psi/apsi_wrapper/cli:entry",
- "//psi/legacy:bucket_psi",
- "@boost//:algorithm",
- ],
-)
-
psi_cc_library(
name = "trace_categories",
srcs = ["trace_categories.cc"],
hdrs = ["trace_categories.h"],
deps = [
- "@com_github_google_perfetto//:perfetto",
- ],
-)
-
-psi_cc_test(
- name = "psi_test",
- srcs = ["psi_test.cc"],
- flaky = True,
- deps = [
- ":factory",
- "//psi/utils:arrow_csv_batch_provider",
- "@yacl//yacl/utils:scope_guard",
+ "@perfetto",
],
)
@@ -101,40 +60,3 @@ psi_cc_library(
name = "version",
hdrs = ["version.h"],
)
-
-psi_cc_library(
- name = "kuscia_adapter",
- srcs = [
- "kuscia_adapter.cc",
- ],
- hdrs = [
- "kuscia_adapter.h",
- ],
- deps = [
- "//psi/proto:entry_cc_proto",
- "//psi/proto:kuscia_cc_proto",
- "@com_github_tencent_rapidjson//:rapidjson",
- "@yacl//yacl/base:exception",
- ],
-)
-
-psi_cc_test(
- name = "kuscia_adapter_test",
- srcs = ["kuscia_adapter_test.cc"],
- deps = [
- ":kuscia_adapter",
- ],
-)
-
-psi_cc_binary(
- name = "main",
- srcs = ["main.cc"],
- deps = [
- ":kuscia_adapter",
- ":version",
- "//psi:launch",
- "//psi/proto:entry_cc_proto",
- "//psi/utils:resource_manager",
- "@com_github_gflags_gflags//:gflags",
- ],
-)
diff --git a/psi/ecdh/BUILD.bazel b/psi/algorithm/ecdh/BUILD.bazel
similarity index 98%
rename from psi/ecdh/BUILD.bazel
rename to psi/algorithm/ecdh/BUILD.bazel
index 7b91ff3..7cd1a78 100644
--- a/psi/ecdh/BUILD.bazel
+++ b/psi/algorithm/ecdh/BUILD.bazel
@@ -27,7 +27,7 @@ psi_cc_library(
"//psi/utils:communication",
"//psi/utils:ec_point_store",
"//psi/utils:recovery",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/link",
"@yacl//yacl/utils:parallel",
],
diff --git a/psi/ecdh/common.h b/psi/algorithm/ecdh/common.h
similarity index 100%
rename from psi/ecdh/common.h
rename to psi/algorithm/ecdh/common.h
diff --git a/psi/ecdh/ecdh_3pc_psi.cc b/psi/algorithm/ecdh/ecdh_3pc_psi.cc
similarity index 99%
rename from psi/ecdh/ecdh_3pc_psi.cc
rename to psi/algorithm/ecdh/ecdh_3pc_psi.cc
index 50a6158..fb11d7f 100644
--- a/psi/ecdh/ecdh_3pc_psi.cc
+++ b/psi/algorithm/ecdh/ecdh_3pc_psi.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ecdh_3pc_psi.h"
+#include "psi/algorithm/ecdh/ecdh_3pc_psi.h"
#include
#include
@@ -376,4 +376,4 @@ size_t ShuffleEcdh3PcPsi::GetPartnersPsiPeerRank() {
}
}
-} // namespace psi::ecdh
\ No newline at end of file
+} // namespace psi::ecdh
diff --git a/psi/ecdh/ecdh_3pc_psi.h b/psi/algorithm/ecdh/ecdh_3pc_psi.h
similarity index 99%
rename from psi/ecdh/ecdh_3pc_psi.h
rename to psi/algorithm/ecdh/ecdh_3pc_psi.h
index 29899a7..abe8793 100644
--- a/psi/ecdh/ecdh_3pc_psi.h
+++ b/psi/algorithm/ecdh/ecdh_3pc_psi.h
@@ -18,7 +18,7 @@
#include
#include
-#include "psi/ecdh/ecdh_psi.h"
+#include "psi/algorithm/ecdh/ecdh_psi.h"
#include "psi/utils/communication.h"
namespace psi::ecdh {
diff --git a/psi/ecdh/ecdh_3pc_psi_benchmark.cc b/psi/algorithm/ecdh/ecdh_3pc_psi_benchmark.cc
similarity index 98%
rename from psi/ecdh/ecdh_3pc_psi_benchmark.cc
rename to psi/algorithm/ecdh/ecdh_3pc_psi_benchmark.cc
index e66de39..ebed22a 100644
--- a/psi/ecdh/ecdh_3pc_psi_benchmark.cc
+++ b/psi/algorithm/ecdh/ecdh_3pc_psi_benchmark.cc
@@ -19,7 +19,7 @@
#include "yacl/base/exception.h"
#include "yacl/link/test_util.h"
-#include "psi/ecdh/ecdh_3pc_psi.h"
+#include "psi/algorithm/ecdh/ecdh_3pc_psi.h"
#include "psi/utils/test_utils.h"
static void BM_Ecdh3PcPsi(benchmark::State& state) {
diff --git a/psi/ecdh/ecdh_3pc_psi_test.cc b/psi/algorithm/ecdh/ecdh_3pc_psi_test.cc
similarity index 99%
rename from psi/ecdh/ecdh_3pc_psi_test.cc
rename to psi/algorithm/ecdh/ecdh_3pc_psi_test.cc
index d5001c6..0248c86 100644
--- a/psi/ecdh/ecdh_3pc_psi_test.cc
+++ b/psi/algorithm/ecdh/ecdh_3pc_psi_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ecdh_3pc_psi.h"
+#include "psi/algorithm/ecdh/ecdh_3pc_psi.h"
#include
#include
diff --git a/psi/ecdh/ecdh_logger.h b/psi/algorithm/ecdh/ecdh_logger.h
similarity index 100%
rename from psi/ecdh/ecdh_logger.h
rename to psi/algorithm/ecdh/ecdh_logger.h
diff --git a/psi/ecdh/ecdh_psi.cc b/psi/algorithm/ecdh/ecdh_psi.cc
similarity index 99%
rename from psi/ecdh/ecdh_psi.cc
rename to psi/algorithm/ecdh/ecdh_psi.cc
index 074260a..20d7278 100644
--- a/psi/ecdh/ecdh_psi.cc
+++ b/psi/algorithm/ecdh/ecdh_psi.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ecdh_psi.h"
+#include "psi/algorithm/ecdh/ecdh_psi.h"
#include
#include
diff --git a/psi/ecdh/ecdh_psi.h b/psi/algorithm/ecdh/ecdh_psi.h
similarity index 99%
rename from psi/ecdh/ecdh_psi.h
rename to psi/algorithm/ecdh/ecdh_psi.h
index 1988b2c..942500e 100644
--- a/psi/ecdh/ecdh_psi.h
+++ b/psi/algorithm/ecdh/ecdh_psi.h
@@ -24,8 +24,8 @@
#include "yacl/link/link.h"
+#include "psi/algorithm/ecdh/ecdh_logger.h"
#include "psi/cryptor/ecc_cryptor.h"
-#include "psi/ecdh/ecdh_logger.h"
#include "psi/utils/batch_provider.h"
#include "psi/utils/communication.h"
#include "psi/utils/ec_point_store.h"
diff --git a/psi/ecdh/ecdh_psi_benchmark.cc b/psi/algorithm/ecdh/ecdh_psi_benchmark.cc
similarity index 98%
rename from psi/ecdh/ecdh_psi_benchmark.cc
rename to psi/algorithm/ecdh/ecdh_psi_benchmark.cc
index de981f5..4a2e8ed 100644
--- a/psi/ecdh/ecdh_psi_benchmark.cc
+++ b/psi/algorithm/ecdh/ecdh_psi_benchmark.cc
@@ -20,8 +20,8 @@
#include "yacl/base/exception.h"
#include "yacl/link/test_util.h"
+#include "psi/algorithm/ecdh/ecdh_psi.h"
#include "psi/cryptor/cryptor_selector.h"
-#include "psi/ecdh/ecdh_psi.h"
#include "psi/utils/batch_provider.h"
#include "psi/utils/ec_point_store.h"
diff --git a/psi/ecdh/ecdh_psi_test.cc b/psi/algorithm/ecdh/ecdh_psi_test.cc
similarity index 99%
rename from psi/ecdh/ecdh_psi_test.cc
rename to psi/algorithm/ecdh/ecdh_psi_test.cc
index 271e4c9..c69d751 100644
--- a/psi/ecdh/ecdh_psi_test.cc
+++ b/psi/algorithm/ecdh/ecdh_psi_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ecdh_psi.h"
+#include "psi/algorithm/ecdh/ecdh_psi.h"
#include
#include
diff --git a/psi/ecdh/receiver.cc b/psi/algorithm/ecdh/receiver.cc
similarity index 98%
rename from psi/ecdh/receiver.cc
rename to psi/algorithm/ecdh/receiver.cc
index 9592339..dbe9e71 100644
--- a/psi/ecdh/receiver.cc
+++ b/psi/algorithm/ecdh/receiver.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/receiver.h"
+#include "psi/algorithm/ecdh/receiver.h"
#include
@@ -21,8 +21,8 @@
#include "yacl/base/exception.h"
#include "yacl/utils/scope_guard.h"
+#include "psi/algorithm/ecdh/common.h"
#include "psi/cryptor/cryptor_selector.h"
-#include "psi/ecdh/common.h"
#include "psi/trace_categories.h"
#include "psi/utils/sync.h"
diff --git a/psi/ecdh/receiver.h b/psi/algorithm/ecdh/receiver.h
similarity index 96%
rename from psi/ecdh/receiver.h
rename to psi/algorithm/ecdh/receiver.h
index cdd72ef..938498f 100644
--- a/psi/ecdh/receiver.h
+++ b/psi/algorithm/ecdh/receiver.h
@@ -13,7 +13,7 @@
// limitations under the License.
#pragma once
-#include "psi/ecdh/ecdh_psi.h"
+#include "psi/algorithm/ecdh/ecdh_psi.h"
#include "psi/interface.h"
#include "psi/utils/arrow_csv_batch_provider.h"
diff --git a/psi/ecdh/sender.cc b/psi/algorithm/ecdh/sender.cc
similarity index 98%
rename from psi/ecdh/sender.cc
rename to psi/algorithm/ecdh/sender.cc
index 1fb62a2..cae3519 100644
--- a/psi/ecdh/sender.cc
+++ b/psi/algorithm/ecdh/sender.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/sender.h"
+#include "psi/algorithm/ecdh/sender.h"
#include
@@ -22,8 +22,8 @@
#include "yacl/base/exception.h"
#include "yacl/utils/scope_guard.h"
+#include "psi/algorithm/ecdh/common.h"
#include "psi/cryptor/cryptor_selector.h"
-#include "psi/ecdh/common.h"
#include "psi/trace_categories.h"
#include "psi/utils/sync.h"
diff --git a/psi/ecdh/sender.h b/psi/algorithm/ecdh/sender.h
similarity index 96%
rename from psi/ecdh/sender.h
rename to psi/algorithm/ecdh/sender.h
index e439d78..7050ede 100644
--- a/psi/ecdh/sender.h
+++ b/psi/algorithm/ecdh/sender.h
@@ -13,7 +13,7 @@
// limitations under the License.
#pragma once
-#include "psi/ecdh/ecdh_psi.h"
+#include "psi/algorithm/ecdh/ecdh_psi.h"
#include "psi/interface.h"
#include "psi/utils/arrow_csv_batch_provider.h"
diff --git a/psi/ecdh/ub_psi/BUILD.bazel b/psi/algorithm/ecdh/ub_psi/BUILD.bazel
similarity index 90%
rename from psi/ecdh/ub_psi/BUILD.bazel
rename to psi/algorithm/ecdh/ub_psi/BUILD.bazel
index e7aba0d..d12f65a 100644
--- a/psi/ecdh/ub_psi/BUILD.bazel
+++ b/psi/algorithm/ecdh/ub_psi/BUILD.bazel
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-load("//bazel:psi.bzl", "psi_cc_binary", "psi_cc_library", "psi_cc_test")
+load("//bazel:psi.bzl", "psi_cc_library", "psi_cc_test")
package(default_visibility = ["//visibility:public"])
@@ -24,8 +24,8 @@ psi_cc_library(
linkopts = ["-ldl"],
deps = [
"//psi/cryptor:ecc_cryptor",
- "@com_github_openssl_openssl//:openssl",
- "@com_google_absl//absl/types:span",
+ "@abseil-cpp//absl/types:span",
+ "@openssl",
"@yacl//yacl/base:byte_container_view",
"@yacl//yacl/base:exception",
"@yacl//yacl/utils:parallel",
@@ -54,8 +54,8 @@ psi_cc_library(
":ecdh_oprf",
"//psi/cryptor:ecc_utils",
"//psi/cryptor:sm2_cryptor",
- "@com_github_microsoft_FourQlib//:FourQlib",
- "@com_google_absl//absl/types:span",
+ "@abseil-cpp//absl/types:span",
+ "@fourqlib//:FourQlib",
"@yacl//yacl/base:exception",
"@yacl//yacl/crypto/hash:blake3",
"@yacl//yacl/crypto/hash:hash_utils",
@@ -83,7 +83,7 @@ psi_cc_library(
"//psi/utils:communication",
"//psi/utils:ec_point_store",
"//psi/utils:ub_psi_cache",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/base:exception",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/link",
@@ -98,7 +98,7 @@ psi_cc_test(
":ecdh_oprf_psi",
"//psi/utils:batch_provider_impl",
"//psi/utils:test_utils",
- "@com_google_absl//absl/time",
+ "@abseil-cpp//absl/time",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/crypto/tools:prg",
"@yacl//yacl/utils:scope_guard",
diff --git a/psi/ecdh/ub_psi/basic_ecdh_oprf.cc b/psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.cc
similarity index 99%
rename from psi/ecdh/ub_psi/basic_ecdh_oprf.cc
rename to psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.cc
index 2612c84..eb52041 100644
--- a/psi/ecdh/ub_psi/basic_ecdh_oprf.cc
+++ b/psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/basic_ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.h"
#include
#include
diff --git a/psi/ecdh/ub_psi/basic_ecdh_oprf.h b/psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.h
similarity index 99%
rename from psi/ecdh/ub_psi/basic_ecdh_oprf.h
rename to psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.h
index 62a2cff..6655080 100644
--- a/psi/ecdh/ub_psi/basic_ecdh_oprf.h
+++ b/psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.h
@@ -26,9 +26,9 @@
#include "yacl/base/exception.h"
#include "yacl/crypto/hash/hash_interface.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf.h"
#include "psi/cryptor/ecc_cryptor.h"
#include "psi/cryptor/sm2_cryptor.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf.h"
// 2HashDH Oprf
// F_k(x) = H2(x, H1(x)^k)
diff --git a/psi/ecdh/ub_psi/basic_ecdh_oprf_test.cc b/psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf_test.cc
similarity index 96%
rename from psi/ecdh/ub_psi/basic_ecdh_oprf_test.cc
rename to psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf_test.cc
index 6152bcd..2c5b542 100644
--- a/psi/ecdh/ub_psi/basic_ecdh_oprf_test.cc
+++ b/psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/basic_ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.h"
#include
#include
@@ -28,7 +28,7 @@
#include "yacl/crypto/rand/rand.h"
#include "yacl/crypto/tools/prg.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf_selector.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h"
namespace psi::ecdh {
struct TestParams {
diff --git a/psi/ecdh/ub_psi/client.cc b/psi/algorithm/ecdh/ub_psi/client.cc
similarity index 99%
rename from psi/ecdh/ub_psi/client.cc
rename to psi/algorithm/ecdh/ub_psi/client.cc
index d45aa83..3f170f4 100644
--- a/psi/ecdh/ub_psi/client.cc
+++ b/psi/algorithm/ecdh/ub_psi/client.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/client.h"
+#include "psi/algorithm/ecdh/ub_psi/client.h"
#include
#include
diff --git a/psi/ecdh/ub_psi/client.h b/psi/algorithm/ecdh/ub_psi/client.h
similarity index 96%
rename from psi/ecdh/ub_psi/client.h
rename to psi/algorithm/ecdh/ub_psi/client.h
index f10cc34..6001bc1 100644
--- a/psi/ecdh/ub_psi/client.h
+++ b/psi/algorithm/ecdh/ub_psi/client.h
@@ -13,7 +13,7 @@
// limitations under the License.
#pragma once
-#include "psi/ecdh/ub_psi/ecdh_oprf_psi.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h"
#include "psi/interface.h"
#include "psi/utils/resource_manager.h"
diff --git a/psi/ecdh/ub_psi/ecdh_oprf.cc b/psi/algorithm/ecdh/ub_psi/ecdh_oprf.cc
similarity index 98%
rename from psi/ecdh/ub_psi/ecdh_oprf.cc
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf.cc
index 4f19b73..b218987 100644
--- a/psi/ecdh/ub_psi/ecdh_oprf.cc
+++ b/psi/algorithm/ecdh/ub_psi/ecdh_oprf.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf.h"
#include
#include
diff --git a/psi/ecdh/ub_psi/ecdh_oprf.h b/psi/algorithm/ecdh/ub_psi/ecdh_oprf.h
similarity index 100%
rename from psi/ecdh/ub_psi/ecdh_oprf.h
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf.h
diff --git a/psi/ecdh/ub_psi/ecdh_oprf_psi.cc b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.cc
similarity index 99%
rename from psi/ecdh/ub_psi/ecdh_oprf_psi.cc
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.cc
index 5c67cfb..b78723b 100644
--- a/psi/ecdh/ub_psi/ecdh_oprf_psi.cc
+++ b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/ecdh_oprf_psi.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h"
#include
#include
@@ -34,8 +34,8 @@
#include "yacl/crypto/rand/rand.h"
#include "yacl/utils/parallel.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h"
#include "psi/cryptor/ecc_utils.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf_selector.h"
#include "psi/utils/communication.h"
#include "psi/utils/serialize.h"
diff --git a/psi/ecdh/ub_psi/ecdh_oprf_psi.h b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h
similarity index 98%
rename from psi/ecdh/ub_psi/ecdh_oprf_psi.h
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h
index 404a534..e6e0aa4 100644
--- a/psi/ecdh/ub_psi/ecdh_oprf_psi.h
+++ b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h
@@ -26,8 +26,8 @@
#include "yacl/base/byte_container_view.h"
#include "yacl/link/link.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf_selector.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h"
#include "psi/utils/batch_provider.h"
#include "psi/utils/ec_point_store.h"
#include "psi/utils/ub_psi_cache.h"
diff --git a/psi/ecdh/ub_psi/ecdh_oprf_psi_test.cc b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi_test.cc
similarity index 99%
rename from psi/ecdh/ub_psi/ecdh_oprf_psi_test.cc
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi_test.cc
index e7c93fe..aaa016e 100644
--- a/psi/ecdh/ub_psi/ecdh_oprf_psi_test.cc
+++ b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/ecdh_oprf_psi.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h"
#include
#include
@@ -31,7 +31,7 @@
#include "yacl/link/test_util.h"
#include "yacl/utils/scope_guard.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf_selector.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h"
#include "psi/utils/arrow_csv_batch_provider.h"
#include "psi/utils/batch_provider_impl.h"
#include "psi/utils/ec_point_store.h"
diff --git a/psi/ecdh/ub_psi/ecdh_oprf_selector.cc b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.cc
similarity index 96%
rename from psi/ecdh/ub_psi/ecdh_oprf_selector.cc
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.cc
index 2d523cf..af1cfcb 100644
--- a/psi/ecdh/ub_psi/ecdh_oprf_selector.cc
+++ b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.cc
@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/ecdh_oprf_selector.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h"
#include "yacl/utils/platform_utils.h"
-#include "psi/ecdh/ub_psi/basic_ecdh_oprf.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf.h"
namespace psi::ecdh {
diff --git a/psi/ecdh/ub_psi/ecdh_oprf_selector.h b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h
similarity index 96%
rename from psi/ecdh/ub_psi/ecdh_oprf_selector.h
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h
index a48be4a..776addb 100644
--- a/psi/ecdh/ub_psi/ecdh_oprf_selector.h
+++ b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h
@@ -16,7 +16,7 @@
#include
-#include "psi/ecdh/ub_psi/ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf.h"
namespace psi::ecdh {
diff --git a/psi/ecdh/ub_psi/server.cc b/psi/algorithm/ecdh/ub_psi/server.cc
similarity index 99%
rename from psi/ecdh/ub_psi/server.cc
rename to psi/algorithm/ecdh/ub_psi/server.cc
index 0ea7aa1..9b65665 100644
--- a/psi/ecdh/ub_psi/server.cc
+++ b/psi/algorithm/ecdh/ub_psi/server.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/server.h"
+#include "psi/algorithm/ecdh/ub_psi/server.h"
#include
diff --git a/psi/ecdh/ub_psi/server.h b/psi/algorithm/ecdh/ub_psi/server.h
similarity index 97%
rename from psi/ecdh/ub_psi/server.h
rename to psi/algorithm/ecdh/ub_psi/server.h
index 7a08c33..05114d9 100644
--- a/psi/ecdh/ub_psi/server.h
+++ b/psi/algorithm/ecdh/ub_psi/server.h
@@ -13,7 +13,7 @@
// limitations under the License.
#pragma once
-#include "psi/ecdh/ub_psi/ecdh_oprf_psi.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h"
#include "psi/interface.h"
#include "psi/utils/arrow_csv_batch_provider.h"
#include "psi/utils/join_processor.h"
diff --git a/psi/kkrt/BUILD.bazel b/psi/algorithm/kkrt/BUILD.bazel
similarity index 98%
rename from psi/kkrt/BUILD.bazel
rename to psi/algorithm/kkrt/BUILD.bazel
index 0e5784c..deeecaa 100644
--- a/psi/kkrt/BUILD.bazel
+++ b/psi/algorithm/kkrt/BUILD.bazel
@@ -25,7 +25,7 @@ psi_cc_library(
"//psi/utils:communication",
"//psi/utils:cuckoo_index",
"//psi/utils:serialize",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/crypto/hash:hash_utils",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/kernel/algorithms:base_ot",
diff --git a/psi/kkrt/common.cc b/psi/algorithm/kkrt/common.cc
similarity index 94%
rename from psi/kkrt/common.cc
rename to psi/algorithm/kkrt/common.cc
index 6eaa87f..5dcf0d4 100644
--- a/psi/kkrt/common.cc
+++ b/psi/algorithm/kkrt/common.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kkrt/common.h"
+#include "psi/algorithm/kkrt/common.h"
#include "psi/utils/bucket.h"
diff --git a/psi/kkrt/common.h b/psi/algorithm/kkrt/common.h
similarity index 100%
rename from psi/kkrt/common.h
rename to psi/algorithm/kkrt/common.h
diff --git a/psi/kkrt/kkrt_psi.cc b/psi/algorithm/kkrt/kkrt_psi.cc
similarity index 99%
rename from psi/kkrt/kkrt_psi.cc
rename to psi/algorithm/kkrt/kkrt_psi.cc
index 1f47a69..4c9376c 100644
--- a/psi/kkrt/kkrt_psi.cc
+++ b/psi/algorithm/kkrt/kkrt_psi.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kkrt/kkrt_psi.h"
+#include "psi/algorithm/kkrt/kkrt_psi.h"
#include
#include
diff --git a/psi/kkrt/kkrt_psi.h b/psi/algorithm/kkrt/kkrt_psi.h
similarity index 100%
rename from psi/kkrt/kkrt_psi.h
rename to psi/algorithm/kkrt/kkrt_psi.h
diff --git a/psi/kkrt/kkrt_psi_benchmark.cc b/psi/algorithm/kkrt/kkrt_psi_benchmark.cc
similarity index 98%
rename from psi/kkrt/kkrt_psi_benchmark.cc
rename to psi/algorithm/kkrt/kkrt_psi_benchmark.cc
index 60e2f24..0eedb74 100644
--- a/psi/kkrt/kkrt_psi_benchmark.cc
+++ b/psi/algorithm/kkrt/kkrt_psi_benchmark.cc
@@ -20,7 +20,7 @@
#include "yacl/crypto/hash/hash_utils.h"
#include "yacl/link/test_util.h"
-#include "psi/kkrt/kkrt_psi.h"
+#include "psi/algorithm/kkrt/kkrt_psi.h"
namespace {
std::vector CreateRangeItems(size_t begin, size_t size) {
diff --git a/psi/kkrt/kkrt_psi_test.cc b/psi/algorithm/kkrt/kkrt_psi_test.cc
similarity index 99%
rename from psi/kkrt/kkrt_psi_test.cc
rename to psi/algorithm/kkrt/kkrt_psi_test.cc
index 20327e8..dc49f74 100644
--- a/psi/kkrt/kkrt_psi_test.cc
+++ b/psi/algorithm/kkrt/kkrt_psi_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kkrt/kkrt_psi.h"
+#include "psi/algorithm/kkrt/kkrt_psi.h"
#include
#include
diff --git a/psi/kkrt/receiver.cc b/psi/algorithm/kkrt/receiver.cc
similarity index 97%
rename from psi/kkrt/receiver.cc
rename to psi/algorithm/kkrt/receiver.cc
index aa35457..e91aa3d 100644
--- a/psi/kkrt/receiver.cc
+++ b/psi/algorithm/kkrt/receiver.cc
@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kkrt/receiver.h"
+#include "psi/algorithm/kkrt/receiver.h"
#include "yacl/crypto/hash/hash_utils.h"
#include "yacl/utils/parallel.h"
-#include "psi/kkrt/common.h"
-#include "psi/kkrt/kkrt_psi.h"
+#include "psi/algorithm/kkrt/common.h"
+#include "psi/algorithm/kkrt/kkrt_psi.h"
#include "psi/legacy/bucket_psi.h"
#include "psi/prelude.h"
#include "psi/trace_categories.h"
diff --git a/psi/kkrt/receiver.h b/psi/algorithm/kkrt/receiver.h
similarity index 100%
rename from psi/kkrt/receiver.h
rename to psi/algorithm/kkrt/receiver.h
diff --git a/psi/kkrt/sender.cc b/psi/algorithm/kkrt/sender.cc
similarity index 97%
rename from psi/kkrt/sender.cc
rename to psi/algorithm/kkrt/sender.cc
index 6e70bee..179bb25 100644
--- a/psi/kkrt/sender.cc
+++ b/psi/algorithm/kkrt/sender.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kkrt/sender.h"
+#include "psi/algorithm/kkrt/sender.h"
#include
#include
@@ -21,8 +21,8 @@
#include "yacl/crypto/hash/hash_utils.h"
#include "yacl/utils/parallel.h"
-#include "psi/kkrt/common.h"
-#include "psi/kkrt/kkrt_psi.h"
+#include "psi/algorithm/kkrt/common.h"
+#include "psi/algorithm/kkrt/kkrt_psi.h"
#include "psi/legacy/bucket_psi.h"
#include "psi/prelude.h"
#include "psi/trace_categories.h"
diff --git a/psi/kkrt/sender.h b/psi/algorithm/kkrt/sender.h
similarity index 100%
rename from psi/kkrt/sender.h
rename to psi/algorithm/kkrt/sender.h
diff --git a/psi/kwpir/BUILD.bazel b/psi/algorithm/kwpir/BUILD.bazel
similarity index 96%
rename from psi/kwpir/BUILD.bazel
rename to psi/algorithm/kwpir/BUILD.bazel
index 6225898..83cb454 100644
--- a/psi/kwpir/BUILD.bazel
+++ b/psi/algorithm/kwpir/BUILD.bazel
@@ -41,6 +41,6 @@ psi_cc_test(
srcs = ["kw_pir_test.cc"],
deps = [
":kw_pir",
- "//psi/sealpir:seal_pir",
+ "//psi/algorithm/sealpir:seal_pir",
],
)
diff --git a/psi/kwpir/index_pir.h b/psi/algorithm/kwpir/index_pir.h
similarity index 100%
rename from psi/kwpir/index_pir.h
rename to psi/algorithm/kwpir/index_pir.h
diff --git a/psi/kwpir/kw_pir.cc b/psi/algorithm/kwpir/kw_pir.cc
similarity index 98%
rename from psi/kwpir/kw_pir.cc
rename to psi/algorithm/kwpir/kw_pir.cc
index 10ec68c..f63cd98 100644
--- a/psi/kwpir/kw_pir.cc
+++ b/psi/algorithm/kwpir/kw_pir.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kwpir/kw_pir.h"
+#include "psi/algorithm/kwpir/kw_pir.h"
#include "yacl/crypto/hash/hash_utils.h"
@@ -131,4 +131,4 @@ std::vector> KwPirClient::DecodeReply(
}
return ele_vec;
}
-} // namespace psi::kwpir
\ No newline at end of file
+} // namespace psi::kwpir
diff --git a/psi/kwpir/kw_pir.h b/psi/algorithm/kwpir/kw_pir.h
similarity index 97%
rename from psi/kwpir/kw_pir.h
rename to psi/algorithm/kwpir/kw_pir.h
index 7e6817e..84d0777 100644
--- a/psi/kwpir/kw_pir.h
+++ b/psi/algorithm/kwpir/kw_pir.h
@@ -24,7 +24,7 @@
#include "yacl/base/byte_container_view.h"
#include "yacl/crypto/rand/rand.h"
-#include "psi/kwpir/index_pir.h"
+#include "psi/algorithm/kwpir/index_pir.h"
#include "psi/utils/cuckoo_index.h"
namespace psi::kwpir {
@@ -88,4 +88,4 @@ class KwPirClient : public KwPir {
private:
std::unique_ptr pir_client_;
};
-} // namespace psi::kwpir
\ No newline at end of file
+} // namespace psi::kwpir
diff --git a/psi/kwpir/kw_pir_test.cc b/psi/algorithm/kwpir/kw_pir_test.cc
similarity index 98%
rename from psi/kwpir/kw_pir_test.cc
rename to psi/algorithm/kwpir/kw_pir_test.cc
index b0be22d..07f22fc 100644
--- a/psi/kwpir/kw_pir_test.cc
+++ b/psi/algorithm/kwpir/kw_pir_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kwpir/kw_pir.h"
+#include "psi/algorithm/kwpir/kw_pir.h"
#include
#include
@@ -21,7 +21,7 @@
#include "spdlog/spdlog.h"
#include "yacl/crypto/hash/hash_utils.h"
-#include "psi/sealpir/seal_pir.h"
+#include "psi/algorithm/sealpir/seal_pir.h"
namespace psi::kwpir {
@@ -152,4 +152,4 @@ INSTANTIATE_TEST_SUITE_P(
TestParams{(1 << 22) - (1 << 10), 3, 1.3, 16, 64, 128, 4096,
2, 0}));
-} // namespace psi::kwpir
\ No newline at end of file
+} // namespace psi::kwpir
diff --git a/psi/rr22/BUILD.bazel b/psi/algorithm/rr22/BUILD.bazel
similarity index 93%
rename from psi/rr22/BUILD.bazel
rename to psi/algorithm/rr22/BUILD.bazel
index 0afc765..82be1c4 100644
--- a/psi/rr22/BUILD.bazel
+++ b/psi/algorithm/rr22/BUILD.bazel
@@ -46,8 +46,8 @@ psi_cc_library(
deps = [
":davis_meyer_hash",
":rr22_utils",
- "//psi/rr22/okvs:aes_crhash",
- "//psi/rr22/okvs:baxos",
+ "//psi/algorithm/rr22/okvs:aes_crhash",
+ "//psi/algorithm/rr22/okvs:baxos",
"@yacl//yacl/base:buffer",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/crypto/tools:prg",
@@ -81,11 +81,11 @@ psi_cc_library(
hdrs = ["rr22_utils.h"],
deps = [
":sparsehash_config",
- "//psi/rr22/okvs:galois128",
- "//psi/rr22/okvs:simple_index",
+ "//psi/algorithm/rr22/okvs:galois128",
+ "//psi/algorithm/rr22/okvs:simple_index",
"//psi/utils:bucket",
- "@com_github_ridiculousfish_libdivide//:libdivide",
- "@com_github_sparsehash_sparsehash//:sparsehash",
+ "@libdivide",
+ "@sparsehash",
"@yacl//yacl/base:buffer",
"@yacl//yacl/base:int128",
"@yacl//yacl/link",
diff --git a/psi/rr22/common.cc b/psi/algorithm/rr22/common.cc
similarity index 96%
rename from psi/rr22/common.cc
rename to psi/algorithm/rr22/common.cc
index 80a4349..8f7fff6 100644
--- a/psi/rr22/common.cc
+++ b/psi/algorithm/rr22/common.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/common.h"
+#include "psi/algorithm/rr22/common.h"
#include "omp.h"
diff --git a/psi/rr22/common.h b/psi/algorithm/rr22/common.h
similarity index 91%
rename from psi/rr22/common.h
rename to psi/algorithm/rr22/common.h
index 2af8dd8..fdbad4c 100644
--- a/psi/rr22/common.h
+++ b/psi/algorithm/rr22/common.h
@@ -15,8 +15,8 @@
#include
-#include "psi/rr22/rr22_oprf.h"
-#include "psi/rr22/rr22_psi.h"
+#include "psi/algorithm/rr22/rr22_oprf.h"
+#include "psi/algorithm/rr22/rr22_psi.h"
#include "psi/utils/recovery.h"
#include "psi/proto/psi_v2.pb.h"
diff --git a/psi/rr22/davis_meyer_hash.cc b/psi/algorithm/rr22/davis_meyer_hash.cc
similarity index 98%
rename from psi/rr22/davis_meyer_hash.cc
rename to psi/algorithm/rr22/davis_meyer_hash.cc
index 919a894..47d78e1 100644
--- a/psi/rr22/davis_meyer_hash.cc
+++ b/psi/algorithm/rr22/davis_meyer_hash.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/davis_meyer_hash.h"
+#include "psi/algorithm/rr22/davis_meyer_hash.h"
#include "yacl/crypto/aes/aes_opt.h"
#include "yacl/crypto/block_cipher/symmetric_crypto.h"
diff --git a/psi/rr22/davis_meyer_hash.h b/psi/algorithm/rr22/davis_meyer_hash.h
similarity index 100%
rename from psi/rr22/davis_meyer_hash.h
rename to psi/algorithm/rr22/davis_meyer_hash.h
diff --git a/psi/rr22/davis_meyer_hash_test.cc b/psi/algorithm/rr22/davis_meyer_hash_test.cc
similarity index 96%
rename from psi/rr22/davis_meyer_hash_test.cc
rename to psi/algorithm/rr22/davis_meyer_hash_test.cc
index f601501..d34df6d 100644
--- a/psi/rr22/davis_meyer_hash_test.cc
+++ b/psi/algorithm/rr22/davis_meyer_hash_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/davis_meyer_hash.h"
+#include "psi/algorithm/rr22/davis_meyer_hash.h"
#include "gtest/gtest.h"
#include "spdlog/spdlog.h"
diff --git a/psi/rr22/okvs/BUILD.bazel b/psi/algorithm/rr22/okvs/BUILD.bazel
similarity index 88%
rename from psi/rr22/okvs/BUILD.bazel
rename to psi/algorithm/rr22/okvs/BUILD.bazel
index d8f7bbe..2598953 100644
--- a/psi/rr22/okvs/BUILD.bazel
+++ b/psi/algorithm/rr22/okvs/BUILD.bazel
@@ -35,7 +35,7 @@ psi_cc_test(
srcs = ["baxos_test.cc"],
deps = [
":baxos",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/crypto/tools:prg",
],
@@ -60,7 +60,7 @@ psi_cc_test(
srcs = ["paxos_test.cc"],
deps = [
":paxos",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/crypto/tools:prg",
],
@@ -77,8 +77,8 @@ psi_cc_library(
deps = [
":aes_crhash",
":galois128",
- "@com_github_ridiculousfish_libdivide//:libdivide",
- "@com_google_absl//absl/types:span",
+ "@abseil-cpp//absl/types:span",
+ "@libdivide",
"@yacl//yacl/math:gadget",
"@yacl//yacl/utils:platform_utils",
],
@@ -99,7 +99,7 @@ psi_cc_library(
hdrs = ["paxos_utils.h"],
deps = [
":galois128",
- "@com_google_absl//absl/types:span",
+ "@abseil-cpp//absl/types:span",
"@yacl//yacl/crypto/tools:prg",
],
)
@@ -120,7 +120,7 @@ psi_cc_test(
deps = [
":aes_crhash",
":galois128",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/crypto/tools:prg",
],
)
@@ -136,7 +136,7 @@ psi_cc_library(
"//conditions:default": [],
}),
deps = [
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/base:block",
"@yacl//yacl/base:int128",
"@yacl//yacl/link",
@@ -149,7 +149,7 @@ psi_cc_test(
srcs = ["galois128_test.cc"],
deps = [
":galois128",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/crypto/tools:prg",
],
@@ -161,7 +161,7 @@ psi_cc_library(
hdrs = ["dense_mtx.h"],
deps = [
":galois128",
- "@com_google_absl//absl/types:span",
+ "@abseil-cpp//absl/types:span",
"@yacl//yacl/crypto/tools:prg",
],
)
@@ -171,8 +171,8 @@ psi_cc_library(
srcs = ["simple_index.cc"],
hdrs = ["simple_index.h"],
deps = [
- "@boost//:math",
- "@boost//:multiprecision",
+ "@boost.math//:boost.math",
+ "@boost.multiprecision//:boost.multiprecision",
"@yacl//yacl/base:exception",
],
)
diff --git a/psi/rr22/okvs/aes_crhash.cc b/psi/algorithm/rr22/okvs/aes_crhash.cc
similarity index 98%
rename from psi/rr22/okvs/aes_crhash.cc
rename to psi/algorithm/rr22/okvs/aes_crhash.cc
index f425046..68f99f5 100644
--- a/psi/rr22/okvs/aes_crhash.cc
+++ b/psi/algorithm/rr22/okvs/aes_crhash.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/aes_crhash.h"
+#include "psi/algorithm/rr22/okvs/aes_crhash.h"
#include
diff --git a/psi/rr22/okvs/aes_crhash.h b/psi/algorithm/rr22/okvs/aes_crhash.h
similarity index 100%
rename from psi/rr22/okvs/aes_crhash.h
rename to psi/algorithm/rr22/okvs/aes_crhash.h
diff --git a/psi/rr22/okvs/aes_crhash_test.cc b/psi/algorithm/rr22/okvs/aes_crhash_test.cc
similarity index 95%
rename from psi/rr22/okvs/aes_crhash_test.cc
rename to psi/algorithm/rr22/okvs/aes_crhash_test.cc
index f42963f..b13808a 100644
--- a/psi/rr22/okvs/aes_crhash_test.cc
+++ b/psi/algorithm/rr22/okvs/aes_crhash_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/aes_crhash.h"
+#include "psi/algorithm/rr22/okvs/aes_crhash.h"
#include
#include
@@ -22,7 +22,7 @@
#include "spdlog/spdlog.h"
#include "yacl/crypto/tools/prg.h"
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/baxos.cc b/psi/algorithm/rr22/okvs/baxos.cc
similarity index 99%
rename from psi/rr22/okvs/baxos.cc
rename to psi/algorithm/rr22/okvs/baxos.cc
index c2ff514..7c7bca0 100644
--- a/psi/rr22/okvs/baxos.cc
+++ b/psi/algorithm/rr22/okvs/baxos.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/baxos.h"
+#include "psi/algorithm/rr22/okvs/baxos.h"
#include
#include
@@ -22,7 +22,7 @@
#include "spdlog/spdlog.h"
-#include "psi/rr22/okvs/simple_index.h"
+#include "psi/algorithm/rr22/okvs/simple_index.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/baxos.h b/psi/algorithm/rr22/okvs/baxos.h
similarity index 96%
rename from psi/rr22/okvs/baxos.h
rename to psi/algorithm/rr22/okvs/baxos.h
index 02d8cdd..e89d27c 100644
--- a/psi/rr22/okvs/baxos.h
+++ b/psi/algorithm/rr22/okvs/baxos.h
@@ -20,10 +20,10 @@
#include "absl/types/span.h"
-#include "psi/rr22/okvs/dense_mtx.h"
-#include "psi/rr22/okvs/galois128.h"
-#include "psi/rr22/okvs/paxos.h"
-#include "psi/rr22/okvs/paxos_utils.h"
+#include "psi/algorithm/rr22/okvs/dense_mtx.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/paxos.h"
+#include "psi/algorithm/rr22/okvs/paxos_utils.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/baxos_test.cc b/psi/algorithm/rr22/okvs/baxos_test.cc
similarity index 98%
rename from psi/rr22/okvs/baxos_test.cc
rename to psi/algorithm/rr22/okvs/baxos_test.cc
index 573f8af..14b04b1 100644
--- a/psi/rr22/okvs/baxos_test.cc
+++ b/psi/algorithm/rr22/okvs/baxos_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/baxos.h"
+#include "psi/algorithm/rr22/okvs/baxos.h"
#include
#include
diff --git a/psi/rr22/okvs/dense_mtx.cc b/psi/algorithm/rr22/okvs/dense_mtx.cc
similarity index 98%
rename from psi/rr22/okvs/dense_mtx.cc
rename to psi/algorithm/rr22/okvs/dense_mtx.cc
index 6735c43..14bd158 100644
--- a/psi/rr22/okvs/dense_mtx.cc
+++ b/psi/algorithm/rr22/okvs/dense_mtx.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/dense_mtx.h"
+#include "psi/algorithm/rr22/okvs/dense_mtx.h"
#include
diff --git a/psi/rr22/okvs/dense_mtx.h b/psi/algorithm/rr22/okvs/dense_mtx.h
similarity index 99%
rename from psi/rr22/okvs/dense_mtx.h
rename to psi/algorithm/rr22/okvs/dense_mtx.h
index 4a2c21e..00eacb8 100644
--- a/psi/rr22/okvs/dense_mtx.h
+++ b/psi/algorithm/rr22/okvs/dense_mtx.h
@@ -19,7 +19,7 @@
#include "yacl/base/exception.h"
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/galois128.cc b/psi/algorithm/rr22/okvs/galois128.cc
similarity index 98%
rename from psi/rr22/okvs/galois128.cc
rename to psi/algorithm/rr22/okvs/galois128.cc
index 181f297..effbd60 100644
--- a/psi/rr22/okvs/galois128.cc
+++ b/psi/algorithm/rr22/okvs/galois128.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
#include
@@ -21,7 +21,7 @@
#include "yacl/utils/platform_utils.h"
#ifdef __x86_64__
-#include "cpu_features/cpuinfo_x86.h"
+#include "cpuinfo_x86.h"
#endif
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/galois128.h b/psi/algorithm/rr22/okvs/galois128.h
similarity index 100%
rename from psi/rr22/okvs/galois128.h
rename to psi/algorithm/rr22/okvs/galois128.h
diff --git a/psi/rr22/okvs/galois128_test.cc b/psi/algorithm/rr22/okvs/galois128_test.cc
similarity index 97%
rename from psi/rr22/okvs/galois128_test.cc
rename to psi/algorithm/rr22/okvs/galois128_test.cc
index 04e78a9..e710b49 100644
--- a/psi/rr22/okvs/galois128_test.cc
+++ b/psi/algorithm/rr22/okvs/galois128_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
#include
diff --git a/psi/rr22/okvs/paxos.cc b/psi/algorithm/rr22/okvs/paxos.cc
similarity index 99%
rename from psi/rr22/okvs/paxos.cc
rename to psi/algorithm/rr22/okvs/paxos.cc
index a01907d..60302d6 100644
--- a/psi/rr22/okvs/paxos.cc
+++ b/psi/algorithm/rr22/okvs/paxos.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/paxos.h"
+#include "psi/algorithm/rr22/okvs/paxos.h"
#include
#include
diff --git a/psi/rr22/okvs/paxos.h b/psi/algorithm/rr22/okvs/paxos.h
similarity index 98%
rename from psi/rr22/okvs/paxos.h
rename to psi/algorithm/rr22/okvs/paxos.h
index 2ec657f..43e5e28 100644
--- a/psi/rr22/okvs/paxos.h
+++ b/psi/algorithm/rr22/okvs/paxos.h
@@ -22,10 +22,10 @@
#include "libdivide.h"
#include "yacl/utils/platform_utils.h"
-#include "psi/rr22/okvs/dense_mtx.h"
-#include "psi/rr22/okvs/galois128.h"
-#include "psi/rr22/okvs/paxos_hash.h"
-#include "psi/rr22/okvs/paxos_utils.h"
+#include "psi/algorithm/rr22/okvs/dense_mtx.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/paxos_hash.h"
+#include "psi/algorithm/rr22/okvs/paxos_utils.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/paxos_hash.cc b/psi/algorithm/rr22/okvs/paxos_hash.cc
similarity index 99%
rename from psi/rr22/okvs/paxos_hash.cc
rename to psi/algorithm/rr22/okvs/paxos_hash.cc
index 975963f..887fff5 100644
--- a/psi/rr22/okvs/paxos_hash.cc
+++ b/psi/algorithm/rr22/okvs/paxos_hash.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/paxos_hash.h"
+#include "psi/algorithm/rr22/okvs/paxos_hash.h"
#include
diff --git a/psi/rr22/okvs/paxos_hash.h b/psi/algorithm/rr22/okvs/paxos_hash.h
similarity index 98%
rename from psi/rr22/okvs/paxos_hash.h
rename to psi/algorithm/rr22/okvs/paxos_hash.h
index 79d4b16..4d60bfd 100644
--- a/psi/rr22/okvs/paxos_hash.h
+++ b/psi/algorithm/rr22/okvs/paxos_hash.h
@@ -23,8 +23,8 @@
#include "yacl/math/gadget.h"
#include "yacl/utils/platform_utils.h"
-#include "psi/rr22/okvs/aes_crhash.h"
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/aes_crhash.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/paxos_hash_test.cc b/psi/algorithm/rr22/okvs/paxos_hash_test.cc
similarity index 97%
rename from psi/rr22/okvs/paxos_hash_test.cc
rename to psi/algorithm/rr22/okvs/paxos_hash_test.cc
index b109cb8..ea4bd74 100644
--- a/psi/rr22/okvs/paxos_hash_test.cc
+++ b/psi/algorithm/rr22/okvs/paxos_hash_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/paxos_hash.h"
+#include "psi/algorithm/rr22/okvs/paxos_hash.h"
#include "gtest/gtest.h"
#include "spdlog/spdlog.h"
diff --git a/psi/rr22/okvs/paxos_test.cc b/psi/algorithm/rr22/okvs/paxos_test.cc
similarity index 98%
rename from psi/rr22/okvs/paxos_test.cc
rename to psi/algorithm/rr22/okvs/paxos_test.cc
index 623c966..d892652 100644
--- a/psi/rr22/okvs/paxos_test.cc
+++ b/psi/algorithm/rr22/okvs/paxos_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/paxos.h"
+#include "psi/algorithm/rr22/okvs/paxos.h"
#include "absl/strings/escaping.h"
#include "gtest/gtest.h"
diff --git a/psi/rr22/okvs/paxos_utils.cc b/psi/algorithm/rr22/okvs/paxos_utils.cc
similarity index 92%
rename from psi/rr22/okvs/paxos_utils.cc
rename to psi/algorithm/rr22/okvs/paxos_utils.cc
index 4eede9e..2127b83 100644
--- a/psi/rr22/okvs/paxos_utils.cc
+++ b/psi/algorithm/rr22/okvs/paxos_utils.cc
@@ -12,6 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/paxos_utils.h"
+#include "psi/algorithm/rr22/okvs/paxos_utils.h"
namespace psi::rr22::okvs {}
diff --git a/psi/rr22/okvs/paxos_utils.h b/psi/algorithm/rr22/okvs/paxos_utils.h
similarity index 99%
rename from psi/rr22/okvs/paxos_utils.h
rename to psi/algorithm/rr22/okvs/paxos_utils.h
index 9bd7112..654a30a 100644
--- a/psi/rr22/okvs/paxos_utils.h
+++ b/psi/algorithm/rr22/okvs/paxos_utils.h
@@ -22,7 +22,7 @@
#include "absl/types/span.h"
#include "yacl/crypto/tools/prg.h"
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/simple_index.cc b/psi/algorithm/rr22/okvs/simple_index.cc
similarity index 99%
rename from psi/rr22/okvs/simple_index.cc
rename to psi/algorithm/rr22/okvs/simple_index.cc
index 15883d3..58c4a51 100644
--- a/psi/rr22/okvs/simple_index.cc
+++ b/psi/algorithm/rr22/okvs/simple_index.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/simple_index.h"
+#include "psi/algorithm/rr22/okvs/simple_index.h"
#include
#include
diff --git a/psi/rr22/okvs/simple_index.h b/psi/algorithm/rr22/okvs/simple_index.h
similarity index 100%
rename from psi/rr22/okvs/simple_index.h
rename to psi/algorithm/rr22/okvs/simple_index.h
diff --git a/psi/rr22/receiver.cc b/psi/algorithm/rr22/receiver.cc
similarity index 96%
rename from psi/rr22/receiver.cc
rename to psi/algorithm/rr22/receiver.cc
index b7e5285..4c235cc 100644
--- a/psi/rr22/receiver.cc
+++ b/psi/algorithm/rr22/receiver.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/receiver.h"
+#include "psi/algorithm/rr22/receiver.h"
#include
#include
@@ -24,11 +24,11 @@
#include "yacl/crypto/rand/rand.h"
#include "yacl/utils/parallel.h"
+#include "psi/algorithm/rr22/common.h"
+#include "psi/algorithm/rr22/rr22_psi.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
#include "psi/legacy/bucket_psi.h"
#include "psi/prelude.h"
-#include "psi/rr22/common.h"
-#include "psi/rr22/rr22_psi.h"
-#include "psi/rr22/rr22_utils.h"
#include "psi/trace_categories.h"
#include "psi/utils/bucket.h"
#include "psi/utils/serialize.h"
diff --git a/psi/rr22/receiver.h b/psi/algorithm/rr22/receiver.h
similarity index 96%
rename from psi/rr22/receiver.h
rename to psi/algorithm/rr22/receiver.h
index b9efa0f..ab06fbe 100644
--- a/psi/rr22/receiver.h
+++ b/psi/algorithm/rr22/receiver.h
@@ -13,8 +13,8 @@
// limitations under the License.
#pragma once
+#include "psi/algorithm/rr22/rr22_psi.h"
#include "psi/interface.h"
-#include "psi/rr22/rr22_psi.h"
#include "psi/utils/hash_bucket_cache.h"
#include "psi/proto/psi_v2.pb.h"
diff --git a/psi/rr22/rr22_oprf.cc b/psi/algorithm/rr22/rr22_oprf.cc
similarity index 99%
rename from psi/rr22/rr22_oprf.cc
rename to psi/algorithm/rr22/rr22_oprf.cc
index 49645db..9812812 100644
--- a/psi/rr22/rr22_oprf.cc
+++ b/psi/algorithm/rr22/rr22_oprf.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/rr22_oprf.h"
+#include "psi/algorithm/rr22/rr22_oprf.h"
#include
#include
@@ -29,9 +29,9 @@
#include "yacl/math/galois_field/gf_intrinsic.h"
#include "yacl/utils/parallel.h"
-#include "psi/rr22/davis_meyer_hash.h"
-#include "psi/rr22/okvs/galois128.h"
-#include "psi/rr22/rr22_utils.h"
+#include "psi/algorithm/rr22/davis_meyer_hash.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
namespace psi::rr22 {
diff --git a/psi/rr22/rr22_oprf.h b/psi/algorithm/rr22/rr22_oprf.h
similarity index 99%
rename from psi/rr22/rr22_oprf.h
rename to psi/algorithm/rr22/rr22_oprf.h
index db436dc..ecdc4f9 100644
--- a/psi/rr22/rr22_oprf.h
+++ b/psi/algorithm/rr22/rr22_oprf.h
@@ -22,7 +22,7 @@
#include "yacl/kernel/algorithms/silent_vole.h"
#include "yacl/link/context.h"
-#include "psi/rr22/okvs/baxos.h"
+#include "psi/algorithm/rr22/okvs/baxos.h"
// Reference:
// Blazing Fast PSI from Improved OKVS and Subfield VOLE
diff --git a/psi/rr22/rr22_oprf_test.cc b/psi/algorithm/rr22/rr22_oprf_test.cc
similarity index 97%
rename from psi/rr22/rr22_oprf_test.cc
rename to psi/algorithm/rr22/rr22_oprf_test.cc
index 0da6774..798b970 100644
--- a/psi/rr22/rr22_oprf_test.cc
+++ b/psi/algorithm/rr22/rr22_oprf_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/rr22_oprf.h"
+#include "psi/algorithm/rr22/rr22_oprf.h"
#include
#include
@@ -21,7 +21,7 @@
#include "yacl/crypto/tools/prg.h"
#include "yacl/link/test_util.h"
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
namespace psi::rr22 {
diff --git a/psi/rr22/rr22_psi.cc b/psi/algorithm/rr22/rr22_psi.cc
similarity index 96%
rename from psi/rr22/rr22_psi.cc
rename to psi/algorithm/rr22/rr22_psi.cc
index d33f40e..9553435 100644
--- a/psi/rr22/rr22_psi.cc
+++ b/psi/algorithm/rr22/rr22_psi.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/rr22_psi.h"
+#include "psi/algorithm/rr22/rr22_psi.h"
#include
#include
@@ -28,9 +28,9 @@
#include "yacl/base/byte_container_view.h"
#include "yacl/utils/parallel.h"
-#include "psi/rr22/okvs/galois128.h"
-#include "psi/rr22/rr22_oprf.h"
-#include "psi/rr22/rr22_utils.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/rr22_oprf.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
#include "psi/utils/bucket.h"
#include "psi/utils/sync.h"
diff --git a/psi/rr22/rr22_psi.h b/psi/algorithm/rr22/rr22_psi.h
similarity index 99%
rename from psi/rr22/rr22_psi.h
rename to psi/algorithm/rr22/rr22_psi.h
index 6375bac..d940ad7 100644
--- a/psi/rr22/rr22_psi.h
+++ b/psi/algorithm/rr22/rr22_psi.h
@@ -31,7 +31,7 @@
#include "yacl/base/int128.h"
#include "yacl/link/context.h"
-#include "psi/rr22/rr22_oprf.h"
+#include "psi/algorithm/rr22/rr22_oprf.h"
#include "psi/utils/bucket.h"
#include "psi/utils/hash_bucket_cache.h"
diff --git a/psi/rr22/rr22_psi_benchmark.cc b/psi/algorithm/rr22/rr22_psi_benchmark.cc
similarity index 98%
rename from psi/rr22/rr22_psi_benchmark.cc
rename to psi/algorithm/rr22/rr22_psi_benchmark.cc
index 4e633de..5b5c493 100644
--- a/psi/rr22/rr22_psi_benchmark.cc
+++ b/psi/algorithm/rr22/rr22_psi_benchmark.cc
@@ -27,9 +27,9 @@
#include "yacl/link/context.h"
#include "yacl/link/test_util.h"
-#include "psi/rr22/rr22_oprf.h"
-#include "psi/rr22/rr22_psi.h"
-#include "psi/rr22/rr22_utils.h"
+#include "psi/algorithm/rr22/rr22_oprf.h"
+#include "psi/algorithm/rr22/rr22_psi.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
namespace {
diff --git a/psi/rr22/rr22_psi_test.cc b/psi/algorithm/rr22/rr22_psi_test.cc
similarity index 98%
rename from psi/rr22/rr22_psi_test.cc
rename to psi/algorithm/rr22/rr22_psi_test.cc
index b2e12a6..c28fb3a 100644
--- a/psi/rr22/rr22_psi_test.cc
+++ b/psi/algorithm/rr22/rr22_psi_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/rr22_psi.h"
+#include "psi/algorithm/rr22/rr22_psi.h"
#include
#include
@@ -28,7 +28,7 @@
#include "yacl/crypto/tools/prg.h"
#include "yacl/link/test_util.h"
-#include "psi/rr22/rr22_utils.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
#include "psi/utils/hash_bucket_cache.h"
namespace psi::rr22 {
diff --git a/psi/rr22/rr22_utils.cc b/psi/algorithm/rr22/rr22_utils.cc
similarity index 98%
rename from psi/rr22/rr22_utils.cc
rename to psi/algorithm/rr22/rr22_utils.cc
index 6528106..839ac2b 100644
--- a/psi/rr22/rr22_utils.cc
+++ b/psi/algorithm/rr22/rr22_utils.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/rr22_utils.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
#include
#include
@@ -29,8 +29,8 @@
#include "sparsehash/dense_hash_map"
#include "yacl/utils/parallel.h"
-#include "psi/rr22/okvs/galois128.h"
-#include "psi/rr22/okvs/simple_index.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/simple_index.h"
#include "psi/utils/serialize.h"
namespace psi::rr22 {
diff --git a/psi/rr22/rr22_utils.h b/psi/algorithm/rr22/rr22_utils.h
similarity index 100%
rename from psi/rr22/rr22_utils.h
rename to psi/algorithm/rr22/rr22_utils.h
diff --git a/psi/rr22/sender.cc b/psi/algorithm/rr22/sender.cc
similarity index 96%
rename from psi/rr22/sender.cc
rename to psi/algorithm/rr22/sender.cc
index c40147d..b69f6ce 100644
--- a/psi/rr22/sender.cc
+++ b/psi/algorithm/rr22/sender.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/sender.h"
+#include "psi/algorithm/rr22/sender.h"
#include
#include
@@ -20,10 +20,10 @@
#include "yacl/crypto/hash/hash_utils.h"
#include "yacl/utils/parallel.h"
+#include "psi/algorithm/rr22/common.h"
+#include "psi/algorithm/rr22/rr22_psi.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
#include "psi/legacy/bucket_psi.h"
-#include "psi/rr22/common.h"
-#include "psi/rr22/rr22_psi.h"
-#include "psi/rr22/rr22_utils.h"
#include "psi/trace_categories.h"
#include "psi/utils/bucket.h"
#include "psi/utils/sync.h"
diff --git a/psi/rr22/sender.h b/psi/algorithm/rr22/sender.h
similarity index 96%
rename from psi/rr22/sender.h
rename to psi/algorithm/rr22/sender.h
index 6fbb28d..1b3f484 100644
--- a/psi/rr22/sender.h
+++ b/psi/algorithm/rr22/sender.h
@@ -13,8 +13,8 @@
// limitations under the License.
#pragma once
+#include "psi/algorithm/rr22/rr22_psi.h"
#include "psi/interface.h"
-#include "psi/rr22/rr22_psi.h"
#include "psi/utils/hash_bucket_cache.h"
#include "psi/proto/psi_v2.pb.h"
diff --git a/psi/rr22/sparseconfig.h b/psi/algorithm/rr22/sparseconfig.h
similarity index 100%
rename from psi/rr22/sparseconfig.h
rename to psi/algorithm/rr22/sparseconfig.h
diff --git a/psi/sealpir/BUILD.bazel b/psi/algorithm/sealpir/BUILD.bazel
similarity index 85%
rename from psi/sealpir/BUILD.bazel
rename to psi/algorithm/sealpir/BUILD.bazel
index a48675e..b195699 100644
--- a/psi/sealpir/BUILD.bazel
+++ b/psi/algorithm/sealpir/BUILD.bazel
@@ -25,7 +25,7 @@ psi_cc_library(
"-lm",
],
deps = [
- "@com_github_microsoft_seal//:seal",
+ "@seal",
"@yacl//yacl/base:exception",
],
)
@@ -40,10 +40,10 @@ psi_cc_library(
],
deps = [
":seal_pir_utils",
- "//psi/kwpir:kw_pir",
- "//psi/sealpir:serializable_cc_proto",
- "@com_github_microsoft_seal//:seal",
- "@com_github_openssl_openssl//:openssl",
+ ":serializable_cc_proto",
+ "//psi/algorithm/kwpir:kw_pir",
+ "@openssl",
+ "@seal",
"@yacl//yacl/base:byte_container_view",
"@yacl//yacl/base:exception",
"@yacl//yacl/link",
@@ -66,6 +66,6 @@ psi_cc_test(
srcs = ["seal_pir_test.cc"],
deps = [
":seal_pir",
- "@com_github_microsoft_seal//:seal",
+ "@seal",
],
)
diff --git a/psi/sealpir/README.md b/psi/algorithm/sealpir/README.md
similarity index 100%
rename from psi/sealpir/README.md
rename to psi/algorithm/sealpir/README.md
diff --git a/psi/sealpir/seal_pir.cc b/psi/algorithm/sealpir/seal_pir.cc
similarity index 99%
rename from psi/sealpir/seal_pir.cc
rename to psi/algorithm/sealpir/seal_pir.cc
index 3c6a062..3269e76 100644
--- a/psi/sealpir/seal_pir.cc
+++ b/psi/algorithm/sealpir/seal_pir.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/sealpir/seal_pir.h"
+#include "psi/algorithm/sealpir/seal_pir.h"
#include
@@ -1024,4 +1024,4 @@ Ciphertext SealPirClient::GetOne() {
return ct;
}
-} // namespace psi::sealpir
\ No newline at end of file
+} // namespace psi::sealpir
diff --git a/psi/sealpir/seal_pir.h b/psi/algorithm/sealpir/seal_pir.h
similarity index 97%
rename from psi/sealpir/seal_pir.h
rename to psi/algorithm/sealpir/seal_pir.h
index 7ff2a96..eb18f29 100644
--- a/psi/sealpir/seal_pir.h
+++ b/psi/algorithm/sealpir/seal_pir.h
@@ -23,10 +23,10 @@
#include "seal/util/polyarithsmallmod.h"
#include "yacl/base/byte_container_view.h"
-#include "psi/kwpir/index_pir.h"
-#include "psi/sealpir/seal_pir_utils.h"
+#include "psi/algorithm/kwpir/index_pir.h"
+#include "psi/algorithm/sealpir/seal_pir_utils.h"
-#include "psi/sealpir/serializable.pb.h"
+#include "psi/algorithm/sealpir/serializable.pb.h"
namespace psi::sealpir {
@@ -214,4 +214,4 @@ class SealPirClient : public SealPir, public psi::kwpir::IndexPirClient {
friend class SealPirServer;
};
-} // namespace psi::sealpir
\ No newline at end of file
+} // namespace psi::sealpir
diff --git a/psi/sealpir/seal_pir_test.cc b/psi/algorithm/sealpir/seal_pir_test.cc
similarity index 98%
rename from psi/sealpir/seal_pir_test.cc
rename to psi/algorithm/sealpir/seal_pir_test.cc
index c7cf612..a94750f 100644
--- a/psi/sealpir/seal_pir_test.cc
+++ b/psi/algorithm/sealpir/seal_pir_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/sealpir/seal_pir.h"
+#include "psi/algorithm/sealpir/seal_pir.h"
#include
@@ -163,4 +163,4 @@ INSTANTIATE_TEST_SUITE_P(
TestParams{4096, 1 << 18, 10, 0, 2, 20, true},
TestParams{4096, 1000, 288, 100, 2, 20, true},
TestParams{8192, 1000, 288, 0, 2, 20, true}));
-} // namespace psi::sealpir
\ No newline at end of file
+} // namespace psi::sealpir
diff --git a/psi/sealpir/seal_pir_utils.cc b/psi/algorithm/sealpir/seal_pir_utils.cc
similarity index 96%
rename from psi/sealpir/seal_pir_utils.cc
rename to psi/algorithm/sealpir/seal_pir_utils.cc
index b871d0c..0d8ba27 100644
--- a/psi/sealpir/seal_pir_utils.cc
+++ b/psi/algorithm/sealpir/seal_pir_utils.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/sealpir/seal_pir_utils.h"
+#include "psi/algorithm/sealpir/seal_pir_utils.h"
#include "yacl/base/exception.h"
@@ -59,4 +59,4 @@ std::vector MemoryDbPlaintextStore::ReadPlaintexts(
return db_vec_[sub_db_index];
}
-} // namespace psi::sealpir
\ No newline at end of file
+} // namespace psi::sealpir
diff --git a/psi/sealpir/seal_pir_utils.h b/psi/algorithm/sealpir/seal_pir_utils.h
similarity index 100%
rename from psi/sealpir/seal_pir_utils.h
rename to psi/algorithm/sealpir/seal_pir_utils.h
diff --git a/psi/sealpir/serializable.proto b/psi/algorithm/sealpir/serializable.proto
similarity index 100%
rename from psi/sealpir/serializable.proto
rename to psi/algorithm/sealpir/serializable.proto
diff --git a/psi/algorithm/spiral/BUILD.bazel b/psi/algorithm/spiral/BUILD.bazel
new file mode 100644
index 0000000..1056634
--- /dev/null
+++ b/psi/algorithm/spiral/BUILD.bazel
@@ -0,0 +1,172 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load("//bazel:psi.bzl", "psi_cc_library", "psi_cc_test")
+
+package(default_visibility = ["//visibility:public"])
+
+psi_cc_library(
+ name = "common",
+ hdrs = ["common.h"],
+ deps = [
+ "@abseil-cpp//absl/types:span",
+ ],
+)
+
+psi_cc_library(
+ name = "util",
+ srcs = ["util.cc"],
+ hdrs = ["util.h"],
+ deps = [
+ ":params",
+ ],
+)
+
+psi_cc_library(
+ name = "params",
+ srcs = ["params.cc"],
+ hdrs = ["params.h"],
+ deps = [
+ ":common",
+ "//psi/algorithm/spiral/arith",
+ "//psi/algorithm/spiral/arith:ntt_table",
+ "//psi/algorithm/spiral/arith:number_theory",
+ "@yacl//yacl/base:exception",
+ "@yacl//yacl/base:int128",
+ "@yacl//yacl/crypto/hash:blake3",
+ "@yacl//yacl/utils:elapsed_timer",
+ ],
+)
+
+psi_cc_test(
+ name = "params_test",
+ srcs = ["params_test.cc"],
+ deps = [
+ ":common",
+ ":params",
+ ":util",
+ "//psi/algorithm/spiral/arith:ntt_table",
+ ],
+)
+
+psi_cc_library(
+ name = "poly_matrix",
+ srcs = ["poly_matrix.cc"],
+ hdrs = ["poly_matrix.h"],
+ copts = ["-mavx2"],
+ deps = [
+ ":params",
+ ":util",
+ "//psi/algorithm/spiral/arith:arith_params",
+ "//psi/algorithm/spiral/arith:ntt",
+ "@abseil-cpp//absl/strings",
+ "@abseil-cpp//absl/types:span",
+ "@yacl//yacl/base:aligned_vector",
+ "@yacl//yacl/base:exception",
+ "@yacl//yacl/base:int128",
+ "@yacl//yacl/crypto/rand",
+ "@yacl//yacl/crypto/tools:prg",
+ "@yacl//yacl/utils:parallel",
+ ] + select({
+ "@platforms//cpu:aarch64": [
+ "@sse2neon",
+ ],
+ "//conditions:default": [],
+ }),
+)
+
+psi_cc_library(
+ name = "poly_matrix_utils",
+ srcs = ["poly_matrix_utils.cc"],
+ hdrs = ["poly_matrix_utils.h"],
+ copts = ["-mavx2"],
+ deps = [
+ "poly_matrix",
+ ":params",
+ ":util",
+ "//psi/algorithm/spiral/arith:arith_params",
+ "//psi/algorithm/spiral/arith:ntt",
+ "@abseil-cpp//absl/strings",
+ "@abseil-cpp//absl/types:span",
+ "@yacl//yacl/base:aligned_vector",
+ "@yacl//yacl/base:exception",
+ "@yacl//yacl/base:int128",
+ "@yacl//yacl/crypto/rand",
+ "@yacl//yacl/crypto/tools:prg",
+ "@yacl//yacl/utils:parallel",
+ ],
+)
+
+psi_cc_test(
+ name = "poly_matrix_test",
+ srcs = ["poly_matrix_test.cc"],
+ copts = ["-mavx2"],
+ deps = [
+ "poly_matrix",
+ "poly_matrix_utils",
+ ":common",
+ ":params",
+ ":util",
+ "//psi/algorithm/spiral/arith:ntt_table",
+ "@abseil-cpp//absl/types:span",
+ "@seal",
+ "@yacl//yacl/base:buffer",
+ "@yacl//yacl/utils:elapsed_timer",
+ "@yacl//yacl/utils:parallel",
+ ],
+)
+
+psi_cc_library(
+ name = "discrete_gaussian",
+ srcs = ["discrete_gaussian.cc"],
+ hdrs = ["discrete_gaussian.h"],
+ deps = [
+ ":poly_matrix",
+ "@yacl//yacl/crypto/rand",
+ "@yacl//yacl/crypto/tools:prg",
+ ],
+)
+
+psi_cc_test(
+ name = "discrete_gaussian_test",
+ srcs = ["discrete_gaussian_test.cc"],
+ deps = [
+ ":discrete_gaussian",
+ ":params",
+ ":poly_matrix",
+ ":util",
+ "@yacl//yacl/crypto/rand",
+ "@yacl//yacl/crypto/tools:prg",
+ ],
+)
+
+psi_cc_library(
+ name = "gadget",
+ srcs = ["gadget.cc"],
+ hdrs = ["gadget.h"],
+ deps = [
+ ":params",
+ ":poly_matrix",
+ ],
+)
+
+psi_cc_test(
+ name = "gadget_test",
+ srcs = ["gadget_test.cc"],
+ deps = [
+ ":gadget",
+ ":params",
+ ":util",
+ ],
+)
diff --git a/psi/algorithm/spiral/README.md b/psi/algorithm/spiral/README.md
new file mode 100644
index 0000000..7ed8299
--- /dev/null
+++ b/psi/algorithm/spiral/README.md
@@ -0,0 +1,4 @@
+
+This is a C++ implementation of [Spiral Fast, High Rate Single Server PIR via FHE Composition](https://eprint.iacr.org/2022/368).
+
+We referred to the [Rust implementation](https://github.com/blyssprivacy/sdk/tree/main/lib/spiral-rs) corresponding to this paper.
\ No newline at end of file
diff --git a/psi/algorithm/spiral/arith/BUILD.bazel b/psi/algorithm/spiral/arith/BUILD.bazel
new file mode 100644
index 0000000..c86f328
--- /dev/null
+++ b/psi/algorithm/spiral/arith/BUILD.bazel
@@ -0,0 +1,112 @@
+load("//bazel:psi.bzl", "psi_cc_library", "psi_cc_test")
+
+package(default_visibility = ["//visibility:public"])
+
+psi_cc_library(
+ name = "arith",
+ hdrs = ["arith.h"],
+ deps = [
+ "//psi/algorithm/spiral:common",
+ "@abseil-cpp//absl/strings",
+ "@seal",
+ "@yacl//yacl/base:exception",
+ "@yacl//yacl/base:int128",
+ "@yacl//yacl/math:gadget",
+ ],
+)
+
+psi_cc_library(
+ name = "arith_params",
+ hdrs = ["arith_params.h"],
+ deps = [
+ ":arith",
+ "//psi/algorithm/spiral:common",
+ "//psi/algorithm/spiral:params",
+ "@abseil-cpp//absl/strings",
+ "@seal",
+ "@yacl//yacl/base:exception",
+ "@yacl//yacl/base:int128",
+ ],
+)
+
+psi_cc_library(
+ name = "number_theory",
+ hdrs = ["number_theory.h"],
+ deps = [
+ ":arith",
+ "//psi/algorithm/spiral:common",
+ "@abseil-cpp//absl/strings",
+ "@seal",
+ "@yacl//yacl/base:exception",
+ ],
+)
+
+psi_cc_library(
+ name = "ntt_table",
+ srcs = ["ntt_table.cc"],
+ hdrs = ["ntt_table.h"],
+ deps = [
+ ":arith",
+ ":number_theory",
+ "//psi/algorithm/spiral:common",
+ "@seal",
+ "@yacl//yacl/base:exception",
+ ],
+)
+
+psi_cc_library(
+ name = "ntt",
+ srcs = ["ntt.cc"],
+ hdrs = ["ntt.h"],
+ copts = ["-mavx2"],
+ deps = [
+ ":arith",
+ ":ntt_table",
+ ":number_theory",
+ "//psi/algorithm/spiral:params",
+ "@abseil-cpp//absl/types:span",
+ "@seal",
+ "@yacl//yacl/base:aligned_vector",
+ "@yacl//yacl/base:exception",
+ "@yacl//yacl/utils:parallel",
+ ] + select({
+ "@platforms//cpu:aarch64": [
+ "@sse2neon",
+ ],
+ "//conditions:default": [],
+ }),
+)
+
+psi_cc_test(
+ name = "arith_test",
+ srcs = ["arith_test.cc"],
+ deps = [
+ ":arith",
+ ":arith_params",
+ "//psi/algorithm/spiral:util",
+ "@abseil-cpp//absl/strings",
+ "@seal",
+ ],
+)
+
+psi_cc_test(
+ name = "number_theory_test",
+ srcs = ["number_theory_test.cc"],
+ deps = [
+ ":number_theory",
+ ],
+)
+
+psi_cc_test(
+ name = "ntt_table_test",
+ srcs = ["ntt_table_test.cc"],
+ copts = ["-mavx2"],
+ deps = [
+ ":ntt",
+ ":ntt_table",
+ "//psi/algorithm/spiral:params",
+ "//psi/algorithm/spiral:util",
+ "@abseil-cpp//absl/types:span",
+ "@yacl//yacl/base:aligned_vector",
+ ],
+)
diff --git a/psi/algorithm/spiral/arith/arith.h b/psi/algorithm/spiral/arith/arith.h
new file mode 100644
index 0000000..90bdbc7
--- /dev/null
+++ b/psi/algorithm/spiral/arith/arith.h
@@ -0,0 +1,235 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include
+#include
+#include
+
+#include "absl/strings/numbers.h"
+#include "seal/seal.h"
+#include "seal/util/common.h"
+#include "seal/util/uintarith.h"
+#include "seal/util/uintarithsmallmod.h"
+#include "yacl/base/exception.h"
+#include "yacl/base/int128.h"
+#include "yacl/math/gadget.h"
+
+#include "psi/algorithm/spiral/common.h"
+
+namespace psi::spiral::arith {
+
+inline std::uint64_t Log2(std::uint64_t a) { return yacl::math::Log2Floor(a); }
+
+inline std::uint64_t Log2Ceil(std::uint64_t a) {
+ return yacl::math::Log2Ceil(a);
+}
+
+inline std::pair GetBarrettCrs(
+ std::uint64_t modulus) {
+ // represent 2^{128}
+ std::array numerator{0, 0, 1};
+ std::array quotient{0, 0, 0};
+ seal::util::divide_uint192_inplace(numerator.data(), modulus,
+ quotient.data());
+ // barrett redeuce precomputation
+ return std::make_pair(quotient[0], quotient[1]);
+}
+
+inline std::pair,
+ std::array>
+GetBarrett(const std::vector& moduli) {
+ std::array cr0{0, 0, 0, 0};
+ std::array cr1{0, 0, 0, 0};
+
+ for (std::size_t i = 0; i < moduli.size(); ++i) {
+ std::pair crs = GetBarrettCrs(moduli[i]);
+ cr0[i] = crs.first;
+ cr1[i] = crs.second;
+ }
+ return std::make_pair(cr0, cr1);
+}
+
+inline std::uint64_t ExponentitateUintMod(std::uint64_t operand,
+ std::uint64_t exponent,
+ std::uint64_t modulus) {
+ seal::Modulus mod(modulus);
+ return seal::util::exponentiate_uint_mod(operand, exponent, mod);
+}
+
+inline std::uint64_t ExponentitateUintMod(std::uint64_t operand,
+ std::uint64_t exponent,
+ const seal::Modulus& mod) {
+ return seal::util::exponentiate_uint_mod(operand, exponent, mod);
+}
+
+inline std::uint64_t ReverseBits(std::uint64_t x, std::uint64_t bit_count) {
+ if (bit_count == 0) {
+ return 0;
+ }
+ return seal::util::reverse_bits(x, bit_count);
+}
+
+inline std::uint64_t Div2UintMod(std::uint64_t operand, std::uint64_t modulus) {
+ seal::Modulus mod(modulus);
+ return seal::util::div2_uint_mod(operand, mod);
+}
+
+inline std::uint64_t Div2UintMod(std::uint64_t operand,
+ const seal::Modulus& mod) {
+ return seal::util::div2_uint_mod(operand, mod);
+}
+
+inline std::uint64_t Recenter(std::uint64_t val, std::uint64_t from_modulus,
+ std::uint64_t to_modulus) {
+ YACL_ENFORCE(from_modulus >= to_modulus);
+
+ auto from_modulus_i64 = static_cast(from_modulus);
+ auto to_modulus_i64 = static_cast(to_modulus);
+ auto a_val = static_cast(val);
+
+ if (val >= from_modulus / 2) {
+ a_val -= from_modulus_i64;
+ }
+
+ a_val = a_val + (from_modulus_i64 / to_modulus_i64) * to_modulus_i64 +
+ 2 * to_modulus_i64;
+ a_val %= to_modulus_i64;
+
+ return static_cast(a_val);
+}
+
+inline std::uint64_t BarrettRawU64(std::uint64_t input,
+ std::uint64_t const_ratio_1,
+ std::uint64_t modulus) {
+ std::uint64_t tmp = 0ULL;
+ seal::util::multiply_uint64_hw64(input, const_ratio_1,
+ reinterpret_cast(&tmp));
+
+ std::uint64_t res = input - (tmp * modulus);
+
+ return res >= modulus ? res - modulus : res;
+}
+
+inline std::uint64_t BarrettRawU128(uint128_t val, std::uint64_t cr0,
+ std::uint64_t cr1, std::uint64_t modulus) {
+ auto [h64, l64] = yacl::DecomposeUInt128(val);
+
+ std::uint64_t tmp1 = 0ULL;
+ std::uint64_t tmp3 = 0ULL;
+ // seal api need unsigned long long type
+ unsigned long long carry = 0ULL;
+ // std::array tmp2 = {0ULL, 0ULL};
+ unsigned long long tmp2[2]{0ULL, 0ULL};
+ // Round 1
+ // (x0 * m0)_1 , 即 x0 * m0 的高 64 bits
+ seal::util::multiply_uint64_hw64(l64, cr0, &carry);
+ // tmp2 = [(x0 * m1)_0, (x0 * m1)_1]
+ seal::util::multiply_uint64(l64, cr1, tmp2);
+
+ tmp3 = tmp2[1] + seal::util::add_uint64(tmp2[0], carry, &tmp1);
+
+ // Round2
+ seal::util::multiply_uint64(h64, cr0, tmp2);
+ carry = tmp2[1] + seal::util::add_uint64(tmp1, tmp2[0], &tmp1);
+ // This is all we care about
+ tmp1 = h64 * cr1 + tmp3 + carry;
+
+ // reduction
+ tmp3 = l64 - tmp1 * modulus;
+ // this is a lazy result \in [0, 2*modulus)
+ return tmp3;
+}
+
+inline std::uint64_t BarrettReductionU128Raw(uint128_t val, std::uint64_t cr0,
+ std::uint64_t cr1,
+ std::uint64_t modulus) {
+ std::uint64_t reduced_val = BarrettRawU128(val, cr0, cr1, modulus);
+ reduced_val -= (modulus) * static_cast(reduced_val >= modulus);
+ return reduced_val;
+}
+
+inline std::uint64_t RecenertMod(std::uint64_t val, std::uint64_t small_modulus,
+ std::uint64_t large_modulus) {
+ YACL_ENFORCE_LT(val, small_modulus);
+
+ auto val_i64 = static_cast(val);
+ auto small_modulus_i64 = static_cast(small_modulus);
+ auto large_modulus_i64 = static_cast(large_modulus);
+
+ if (val_i64 > (small_modulus_i64 / 2)) {
+ val_i64 -= small_modulus_i64;
+ }
+ if (val_i64 < 0) {
+ val_i64 += large_modulus_i64;
+ }
+ return static_cast(val_i64);
+}
+
+inline std::uint64_t Rescale(std::uint64_t a, std::uint64_t in_mod,
+ std::uint64_t out_mod) {
+ auto in_mod_i64 = static_cast(in_mod);
+ int128_t in_mod_i128 = yacl::MakeInt128(0, in_mod);
+ int128_t out_mod_i128 = yacl::MakeInt128(0, out_mod);
+
+ auto in_val = static_cast(a % in_mod);
+ if (in_val >= (in_mod_i64 / 2)) {
+ in_val -= in_mod_i64;
+ }
+ std::int64_t sign = (in_val >= 0) ? 1 : -1;
+ // int64_t can directly mul int128_t
+ // do need to firstly convert to
+ int128_t val = in_val * out_mod_i128;
+
+ // val + int64_t = int128_t + int64_t, this is ok
+ int128_t result = (val + sign * (in_mod_i64 / 2)) / in_mod_i128;
+
+ // if the low-64 bit's type is int64_t, you must be carefully use MakeInt128
+ int128_t tmp = yacl::MakeInt128(0, (in_mod / out_mod) * out_mod);
+ result = (result + tmp + (2 * out_mod_i128)) % out_mod_i128;
+
+ YACL_ENFORCE(result >= 0);
+
+ result = (result + out_mod_i128) % out_mod_i128;
+ auto last_result = yacl::DecomposeInt128(result).second;
+
+ return last_result;
+}
+
+inline std::uint64_t MultiplyUintMod(std::uint64_t a, std::uint64_t b,
+ std::uint64_t modulus) {
+ seal::Modulus mod(modulus);
+ return seal::util::multiply_uint_mod(a, b, mod);
+}
+
+inline std::uint64_t MultiplyUintMod(std::uint64_t a, std::uint64_t b,
+ const seal::Modulus& mod) {
+ return seal::util::multiply_uint_mod(a, b, mod);
+}
+
+inline std::uint64_t MultiplyUintMod(std::uint64_t a, std::uint64_t b,
+ std::uint64_t modulus,
+ uint64_t barrett_cr0,
+ uint64_t barrett_cr1) {
+ unsigned long long z[2] = {0ULL, 0ULL};
+ seal::util::multiply_uint64(a, b, z);
+ uint128_t z128 = yacl::MakeUint128(z[1], z[0]);
+ return BarrettReductionU128Raw(z128, barrett_cr0, barrett_cr1, modulus);
+}
+
+inline size_t UintNum(size_t len, size_t uint_len) {
+ return (len + uint_len - 1) / uint_len;
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/arith_params.h b/psi/algorithm/spiral/arith/arith_params.h
new file mode 100644
index 0000000..cbbf6c7
--- /dev/null
+++ b/psi/algorithm/spiral/arith/arith_params.h
@@ -0,0 +1,66 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include
+
+#include "yacl/base/int128.h"
+
+#include "psi/algorithm/spiral/arith/arith.h"
+#include "psi/algorithm/spiral/params.h"
+
+namespace psi::spiral::arith {
+
+inline std::uint64_t BarrettU64(const Params& params, std::uint64_t val) {
+ return BarrettRawU64(val, params.BarrettCr1Modulus(), params.Modulus());
+}
+
+inline std::uint64_t BarrettCoeffU64(const Params& params, std::uint64_t val,
+ std::size_t moduli_idx) {
+ return BarrettRawU64(val, params.BarrettCr1(moduli_idx),
+ params.Moduli(moduli_idx));
+}
+
+inline std::uint64_t BarrettReductionU128(const Params& params, uint128_t val) {
+ return BarrettReductionU128Raw(val, params.BarrettCr0Modulus(),
+ params.BarrettCr1Modulus(), params.Modulus());
+}
+
+inline std::uint64_t MultiplyModular(const Params& params, std::uint64_t a,
+ std::uint64_t b, std::size_t moduli_idx) {
+ return BarrettCoeffU64(params, a * b, moduli_idx);
+}
+
+inline std::uint64_t MultiplyAddModular(const Params& params, std::uint64_t a,
+ std::uint64_t b, std::uint64_t x,
+ std::size_t moduli_idx) {
+ return BarrettCoeffU64(params, a * b + x, moduli_idx);
+}
+
+inline std::uint64_t AddModular(const Params& params, std::uint64_t a,
+ std::uint64_t b, std::size_t moduli_idx) {
+ return BarrettCoeffU64(params, a + b, moduli_idx);
+}
+
+inline std::uint64_t InvertModular(const Params& params, std::uint64_t a,
+ std::size_t moduli_idx) {
+ return params.Moduli(moduli_idx) - a;
+}
+
+inline std::uint64_t ModularReduce(const Params& params, std::uint64_t a,
+ std::size_t moduli_idx) {
+ return BarrettCoeffU64(params, a, moduli_idx);
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/arith_test.cc b/psi/algorithm/spiral/arith/arith_test.cc
new file mode 100644
index 0000000..af7cbe2
--- /dev/null
+++ b/psi/algorithm/spiral/arith/arith_test.cc
@@ -0,0 +1,227 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "psi/algorithm/spiral/arith/arith.h"
+
+#include
+#include
+#include
+
+#include "gtest/gtest.h"
+
+#include "psi/algorithm/spiral/arith/arith_params.h"
+#include "psi/algorithm/spiral/util.h"
+
+namespace psi::spiral::arith {
+
+namespace {
+constexpr std::size_t kMaxLoop = 1000;
+}
+
+TEST(ArithTest, MultiplyUintMod) {
+ std::uint64_t mod{2};
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 0, mod));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 1, mod));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(1, 0, mod));
+ ASSERT_EQ(1ULL, arith::MultiplyUintMod(1, 1, mod));
+
+ auto [cr0, cr1] = arith::GetBarrettCrs(mod);
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 0, mod, cr0, cr1));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 1, mod, cr0, cr1));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(1, 0, mod, cr0, cr1));
+ ASSERT_EQ(1ULL, arith::MultiplyUintMod(1, 1, mod, cr0, cr1));
+
+ mod = 10;
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 0, mod));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 1, mod));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(1, 0, mod));
+ ASSERT_EQ(1ULL, arith::MultiplyUintMod(1, 1, mod));
+ ASSERT_EQ(9ULL, arith::MultiplyUintMod(7, 7, mod));
+ ASSERT_EQ(2ULL, arith::MultiplyUintMod(6, 7, mod));
+ ASSERT_EQ(2ULL, arith::MultiplyUintMod(7, 6, mod));
+
+ mod = 2305843009211596801ULL;
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 0, mod));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 1, mod));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(1, 0, mod));
+ ASSERT_EQ(1ULL, arith::MultiplyUintMod(1, 1, mod));
+ ASSERT_EQ(576460752302899200ULL,
+ arith::MultiplyUintMod(1152921504605798400ULL,
+ 1152921504605798401ULL, mod));
+ ASSERT_EQ(576460752302899200ULL,
+ arith::MultiplyUintMod(1152921504605798401ULL,
+ 1152921504605798400ULL, mod));
+ ASSERT_EQ(1729382256908697601ULL,
+ arith::MultiplyUintMod(1152921504605798401ULL,
+ 1152921504605798401ULL, mod));
+ ASSERT_EQ(1ULL, arith::MultiplyUintMod(2305843009211596800ULL,
+ 2305843009211596800ULL, mod));
+
+ auto [cr00, cr11] = arith::GetBarrettCrs(mod);
+ ASSERT_EQ(576460752302899200ULL,
+ arith::MultiplyUintMod(1152921504605798400ULL,
+ 1152921504605798401ULL, mod, cr00, cr11));
+ ASSERT_EQ(576460752302899200ULL,
+ arith::MultiplyUintMod(1152921504605798401ULL,
+ 1152921504605798400ULL, mod, cr00, cr11));
+ ASSERT_EQ(1729382256908697601ULL,
+ arith::MultiplyUintMod(1152921504605798401ULL,
+ 1152921504605798401ULL, mod, cr00, cr11));
+ ASSERT_EQ(1ULL,
+ arith::MultiplyUintMod(2305843009211596800ULL,
+ 2305843009211596800ULL, mod, cr00, cr11));
+}
+
+TEST(ArithTest, ReverseBits) {
+ ASSERT_EQ(0ULL, arith::ReverseBits(0ULL, 0));
+ ASSERT_EQ(0ULL, arith::ReverseBits(0ULL, 1));
+ ASSERT_EQ(0ULL, arith::ReverseBits(0ULL, 32));
+ ASSERT_EQ(0ULL, arith::ReverseBits(0ULL, 64));
+
+ ASSERT_EQ(0ULL, arith::ReverseBits(1ULL, 0));
+ ASSERT_EQ(1ULL, arith::ReverseBits(1ULL, 1));
+ ASSERT_EQ(1ULL << 31, arith::ReverseBits(1ULL, 32));
+ ASSERT_EQ(1ULL << 63, arith::ReverseBits(1ULL, 64));
+
+ ASSERT_EQ(0ULL, arith::ReverseBits(1ULL << 31, 0));
+ ASSERT_EQ(0ULL, arith::ReverseBits(1ULL << 31, 1));
+ ASSERT_EQ(1ULL, arith::ReverseBits(1ULL << 31, 32));
+ ASSERT_EQ(1ULL << 32, arith::ReverseBits(1ULL << 31, 64));
+
+ ASSERT_EQ(0ULL, arith::ReverseBits(0xFFFFULL << 16, 0));
+ ASSERT_EQ(0ULL, arith::ReverseBits(0xFFFFULL << 16, 1));
+ ASSERT_EQ(0xFFFFULL, arith::ReverseBits(0xFFFFULL << 16, 32));
+ ASSERT_EQ(0xFFFFULL << 32, arith::ReverseBits(0xFFFFULL << 16, 64));
+
+ ASSERT_EQ(0ULL, arith::ReverseBits(0x0000FFFFFFFF0000ULL, 0));
+ ASSERT_EQ(0ULL, arith::ReverseBits(0x0000FFFFFFFF0000ULL, 1));
+ ASSERT_EQ(0xFFFFULL, arith::ReverseBits(0x0000FFFFFFFF0000ULL, 32));
+ ASSERT_EQ(0x0000FFFFFFFF0000ULL,
+ arith::ReverseBits(0x0000FFFFFFFF0000ULL, 64));
+
+ ASSERT_EQ(0ULL, arith::ReverseBits(0xFFFF0000FFFF0000ULL, 0));
+ ASSERT_EQ(0ULL, arith::ReverseBits(0xFFFF0000FFFF0000ULL, 1));
+ ASSERT_EQ(0xFFFFULL, arith::ReverseBits(0xFFFF0000FFFF0000ULL, 32));
+ ASSERT_EQ(0x0000FFFF0000FFFFULL,
+ arith::ReverseBits(0xFFFF0000FFFF0000ULL, 64));
+}
+
+TEST(ArithTest, BarrettRawU64) {
+ std::uint64_t mod{10};
+ auto const_ratio = arith::GetBarrettCrs(mod);
+
+ ASSERT_EQ(0, arith::BarrettRawU64(0, const_ratio.second, mod));
+ ASSERT_EQ(1, arith::BarrettRawU64(1, const_ratio.second, mod));
+ ASSERT_EQ(8, arith::BarrettRawU64(8, const_ratio.second, mod));
+ ASSERT_EQ(7, arith::BarrettRawU64(1234567, const_ratio.second, mod));
+ ASSERT_EQ(0, arith::BarrettRawU64(12345670, const_ratio.second, mod));
+
+ mod = 66974689739603969ULL;
+ std::uint64_t cr1 = 275ULL;
+
+ std::random_device rd;
+ std::mt19937 rng(rd());
+
+ for (std::size_t i = 0; i < kMaxLoop; ++i) {
+ std::uint64_t val = rng();
+ ASSERT_EQ(val % mod, arith::BarrettRawU64(val, cr1, mod));
+ }
+}
+
+TEST(ArithTest, Div2UintMod) { ASSERT_EQ(5, arith::Div2UintMod(3, 7)); }
+
+TEST(ArithTest, GetBarrettCrs) {
+ std::pair expected =
+ std::make_pair(16144578669088582089ULL, 68736257792ULL);
+ ASSERT_EQ(expected, arith::GetBarrettCrs(268369921ULL));
+
+ expected = std::make_pair(10966983149909726427ULL, 73916747789ULL);
+ ASSERT_EQ(expected, arith::GetBarrettCrs(249561089ULL));
+
+ expected = std::make_pair(7906011006380390721ULL, 275ULL);
+ ASSERT_EQ(expected, arith::GetBarrettCrs(66974689739603969ULL));
+}
+TEST(ArithTest, BarrettReductionU128Raw) {
+ std::uint64_t modulus = 66974689739603969ULL;
+ uint128_t modulus_u128 = yacl::MakeUint128(0ULL, modulus);
+
+ std::function exec = [](uint128_t val) {
+ return BarrettReductionU128Raw(val, 7906011006380390721ULL, 275ULL,
+ 66974689739603969ULL);
+ };
+
+ ASSERT_EQ(0, exec(modulus_u128));
+ ASSERT_EQ(1, exec(modulus_u128 + 1));
+ ASSERT_EQ(5, exec((modulus_u128 * 7) + 5));
+
+ std::random_device rd;
+ std::mt19937 rng(rd());
+ for (std::size_t i = 0; i < kMaxLoop; ++i) {
+ std::uint64_t val = rng();
+ uint128_t val_u128 = yacl::MakeUint128(0ULL, val);
+ ASSERT_EQ(val % modulus, exec(val_u128));
+ }
+ // compare with seal::util::barrett_reduce_128
+ modulus = 13131313131313ULL;
+ auto const_ratio = GetBarrettCrs(modulus);
+ seal::Modulus mod(modulus);
+
+ ASSERT_EQ(const_ratio.first, mod.const_ratio()[0]);
+ ASSERT_EQ(const_ratio.second, mod.const_ratio()[1]);
+
+ uint128_t val = yacl::MakeUint128(0, 0);
+ ASSERT_EQ(0, BarrettReductionU128Raw(val, const_ratio.first,
+ const_ratio.second, modulus));
+
+ val = yacl::MakeUint128(0, 1);
+ ASSERT_EQ(1, BarrettReductionU128Raw(val, const_ratio.first,
+ const_ratio.second, modulus));
+
+ val = yacl::MakeUint128(456, 123);
+ ASSERT_EQ(8722750765283ULL,
+ BarrettReductionU128Raw(val, const_ratio.first, const_ratio.second,
+ modulus));
+
+ val = yacl::MakeUint128(79797979797979, 24242424242424);
+ ASSERT_EQ(1010101010101ULL,
+ BarrettReductionU128Raw(val, const_ratio.first, const_ratio.second,
+ modulus));
+}
+
+TEST(ArithTest, Rescale) {
+ ASSERT_EQ(4, Rescale(3, 17, 21));
+ ASSERT_EQ(2, Rescale(3, 21, 17));
+ ASSERT_EQ(1, Rescale(1, 17, 21));
+
+ std::uint64_t in_mod = 0x7fffffd8001ULL;
+ std::uint64_t out_mod = 0x7fffffc8001ULL;
+
+ EXPECT_EQ(Rescale(2721421219ULL, in_mod, out_mod), 2721421199ULL);
+ EXPECT_EQ(Rescale(2093223862ULL, in_mod, out_mod), 2093223846ULL);
+ EXPECT_EQ(Rescale(3304378079ULL, in_mod, out_mod), 3304378054ULL);
+ EXPECT_EQ(Rescale(3286543357ULL, in_mod, out_mod), 3286543333ULL);
+ EXPECT_EQ(Rescale(1506336168ULL, in_mod, out_mod), 1506336157ULL);
+ EXPECT_EQ(Rescale(3294507908ULL, in_mod, out_mod), 3294507883ULL);
+ EXPECT_EQ(Rescale(3602954393ULL, in_mod, out_mod), 3602954366ULL);
+ EXPECT_EQ(Rescale(3268316190ULL, in_mod, out_mod), 3268316166ULL);
+ EXPECT_EQ(Rescale(3730398221ULL, in_mod, out_mod), 3730398193ULL);
+ EXPECT_EQ(Rescale(3537330165ULL, in_mod, out_mod), 3537330139ULL);
+
+ std::uint64_t modulus = 66974689739603969ULL;
+ std::uint64_t pt_modulus = 256;
+ std::uint64_t in = 34795444278750647ULL;
+ EXPECT_EQ(133, Rescale(in, modulus, pt_modulus));
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/ntt.cc b/psi/algorithm/spiral/arith/ntt.cc
new file mode 100644
index 0000000..9acb795
--- /dev/null
+++ b/psi/algorithm/spiral/arith/ntt.cc
@@ -0,0 +1,373 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "psi/algorithm/spiral/arith/ntt.h"
+
+#ifdef __x86_64__
+#include <immintrin.h>
+#elif defined(__aarch64__)
+#include "sse2neon.h"
+#endif
+
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+
+#include "absl/types/span.h"
+#include "spdlog/spdlog.h"
+#include "yacl/base/aligned_vector.h"
+#include "yacl/base/exception.h"
+#include "yacl/utils/parallel.h"
+
+#include "psi/algorithm/spiral/arith/number_theory.h"
+
+namespace psi::spiral::arith {
+
+#ifndef __AVX2__
+
+void NttForward(const Params& params, absl::Span<uint64_t> operand_overall) {
+  std::size_t log_n = params.PolyLenLog2();
+  std::size_t n = static_cast<std::size_t>(1) << log_n;
+
+ for (std::size_t coeff_mod = 0; coeff_mod < params.CrtCount(); ++coeff_mod) {
+ auto operand = operand_overall.subspan(coeff_mod * n, n);
+
+ const auto& forward_table = params.GetNttForwardTable(coeff_mod);
+ const auto& forward_table_prime = params.GetNttForwardPrimeTable(coeff_mod);
+ // why need to convert uint32
+    auto modulus_small = static_cast<std::uint32_t>(params.Moduli(coeff_mod));
+ std::uint32_t two_times_modulus_small = 2 * modulus_small;
+
+ for (std::size_t mm = 0; mm < log_n; ++mm) {
+ std::size_t m = 1 << mm;
+ std::size_t t = n >> (mm + 1);
+
+ for (std::size_t i = 0; i < m; ++i) {
+ uint64_t w = forward_table[m + i];
+ uint64_t w_prime = forward_table_prime[m + i];
+
+ auto op = operand.subspan(i * (2 * t), 2 * t);
+
+ for (std::size_t j = 0; j < t; ++j) {
+          std::uint32_t x = static_cast<std::uint32_t>(op[j]);
+          std::uint32_t y = static_cast<std::uint32_t>(op[t + j]);
+
+          std::uint32_t curr_x =
+              x - (two_times_modulus_small *
+                   static_cast<std::uint32_t>(x >= two_times_modulus_small));
+          std::uint64_t q_tmp = (static_cast<std::uint64_t>(y) *
+                                 static_cast<std::uint64_t>(w_prime)) >>
+                                32;
+          std::uint64_t q_new =
+              w * static_cast<std::uint64_t>(y) -
+              q_tmp * static_cast<std::uint64_t>(modulus_small);
+
+          op[j] = curr_x + q_new;
+          op[t + j] =
+              curr_x + (static_cast<std::uint64_t>(two_times_modulus_small) - q_new);
+ }
+ }
+
+ // Update the operand with modulus constraints
+ for (std::size_t i = 0; i < n; ++i) {
+      operand[i] -=
+          static_cast<std::uint64_t>(operand[i] >= two_times_modulus_small) *
+          two_times_modulus_small;
+      operand[i] -= static_cast<std::uint64_t>(operand[i] >= modulus_small) *
+                    modulus_small;
+ }
+ }
+ }
+}
+
+// AVX2 version of ntt_forward
+#else
+
+void NttForward(const Params& params, absl::Span<uint64_t> operand_overall) {
+  SPDLOG_DEBUG("using AVX2 NttForward");
+
+  std::size_t log_n = params.PolyLenLog2();
+  std::size_t n = static_cast<std::size_t>(1) << log_n;
+
+ YACL_ENFORCE(operand_overall.size() >= params.CrtCount() * n);
+
+ for (std::size_t coeff_mod = 0; coeff_mod < params.CrtCount(); ++coeff_mod) {
+ auto operand = operand_overall.subspan(coeff_mod * n, n);
+
+ const auto& forward_table = params.GetNttForwardTable(coeff_mod);
+ const auto& forward_table_prime = params.GetNttForwardPrimeTable(coeff_mod);
+    auto modulus_small = static_cast<std::uint32_t>(params.Moduli(coeff_mod));
+ std::uint32_t two_times_modulus_small = 2 * modulus_small;
+
+ for (std::size_t mm = 0; mm < log_n; ++mm) {
+ std::size_t m = 1 << mm;
+ std::size_t t = n >> (mm + 1);
+
+ for (std::size_t i = 0; i < m; ++i) {
+ uint64_t w = forward_table[m + i];
+ uint64_t w_prime = forward_table_prime[m + i];
+
+ auto op = operand.subspan(i * (2 * t), 2 * t);
+
+ SPDLOG_DEBUG("Processing coeff_mod: {}, m: {}, i: {}", coeff_mod, m, i);
+
+ if (t < 4) {
+ for (std::size_t j = 0; j < t; ++j) {
+            uint32_t x = static_cast<uint32_t>(op[j]);
+            uint32_t y = static_cast<uint32_t>(op[t + j]);
+
+            std::uint32_t curr_x =
+                x - (two_times_modulus_small *
+                     static_cast<std::uint32_t>(x >= two_times_modulus_small));
+            std::uint64_t q_tmp = (static_cast<std::uint64_t>(y) *
+                                   static_cast<std::uint64_t>(w_prime)) >>
+                                  32;
+            std::uint64_t q_new =
+                w * static_cast<std::uint64_t>(y) -
+                q_tmp * static_cast<std::uint64_t>(modulus_small);
+
+            op[j] = curr_x + q_new;
+            op[t + j] =
+                curr_x +
+                (static_cast<std::uint64_t>(two_times_modulus_small) - q_new);
+ }
+ } else {
+ for (std::size_t j = 0; j < t; j += 4) {
+ if (j + 4 > t) break; // Ensure we do not exceed bounds
+
+ __m256i* p_x = reinterpret_cast<__m256i*>(&op[j]);
+ __m256i* p_y = reinterpret_cast<__m256i*>(&op[j + t]);
+
+ __m256i x = _mm256_loadu_si256(p_x);
+ __m256i y = _mm256_loadu_si256(p_y);
+
+            __m256i cmp_val = _mm256_set1_epi64x(
+                static_cast<std::int64_t>(two_times_modulus_small));
+ // reuse this variable to reduce variable num
+ // gt_mask
+ __m256i tmp1 = _mm256_cmpgt_epi64(x, cmp_val);
+
+ // __m256i to_subtract = _mm256_and_si256(gt_mask_reused, cmp_val);
+ tmp1 = _mm256_and_si256(tmp1, cmp_val);
+ __m256i curr_x = _mm256_sub_epi64(x, tmp1);
+
+            // __m256i w_prime_vec =
+            //     _mm256_set1_epi64x(static_cast<std::int64_t>(w_prime));
+            tmp1 = _mm256_set1_epi64x(static_cast<std::int64_t>(w_prime));
+ // __m256i product = _mm256_mul_epu32(y, tmp1);
+ tmp1 = _mm256_mul_epu32(y, tmp1);
+ // __m256i q_val = _mm256_srli_epi64(tmp1, 32);
+ tmp1 = _mm256_srli_epi64(tmp1, 32);
+
+            // __m256i w_vec = _mm256_set1_epi64x(static_cast<std::int64_t>(w));
+            __m256i tmp2 = _mm256_set1_epi64x(static_cast<std::int64_t>(w));
+ // __m256i w_times_y = _mm256_mul_epu32(y, w_vec);
+ // __m256i w_times_y = _mm256_mul_epu32(y, tmp2);
+ tmp2 = _mm256_mul_epu32(y, tmp2);
+
+            __m256i modulus_small_vec =
+                _mm256_set1_epi64x(static_cast<std::int64_t>(modulus_small));
+ // __m256i q_scaled = _mm256_mul_epu32(q_val, modulus_small_vec);
+ __m256i q_scaled = _mm256_mul_epu32(tmp1, modulus_small_vec);
+ __m256i q_final = _mm256_sub_epi64(tmp2, q_scaled);
+
+ __m256i new_x = _mm256_add_epi64(curr_x, q_final);
+ __m256i q_final_inverted = _mm256_sub_epi64(cmp_val, q_final);
+ __m256i new_y = _mm256_add_epi64(curr_x, q_final_inverted);
+
+ _mm256_storeu_si256(p_x, new_x);
+ _mm256_storeu_si256(p_y, new_y);
+ }
+ }
+ }
+ }
+
+ for (std::size_t i = 0; i < n; i += 4) {
+ if (i + 4 > n) break; // Ensure we do not exceed bounds
+ __m256i* p_x = reinterpret_cast<__m256i*>(&operand[i]);
+
+      __m256i cmp_val1 =
+          _mm256_set1_epi64x(static_cast<std::int64_t>(two_times_modulus_small));
+ __m256i x = _mm256_loadu_si256(p_x);
+ __m256i gt_mask = _mm256_cmpgt_epi64(x, cmp_val1);
+ __m256i to_subtract = _mm256_and_si256(gt_mask, cmp_val1);
+ x = _mm256_sub_epi64(x, to_subtract);
+
+      __m256i cmp_val2 =
+          _mm256_set1_epi64x(static_cast<std::int64_t>(modulus_small));
+ gt_mask = _mm256_cmpgt_epi64(x, cmp_val2);
+ to_subtract = _mm256_and_si256(gt_mask, cmp_val2);
+ x = _mm256_sub_epi64(x, to_subtract);
+ _mm256_storeu_si256(p_x, x);
+ }
+ }
+}
+#endif
+
+#ifndef __AVX2__
+void NttInverse(const Params& params, absl::Span<uint64_t> operand_overall) {
+ for (std::size_t coeff_mod = 0; coeff_mod < params.CrtCount(); ++coeff_mod) {
+ std::size_t n = params.PolyLen();
+ auto operand = operand_overall.subspan(coeff_mod * n, n);
+
+ const auto& inverse_table = params.GetNttInverseTable(coeff_mod);
+ const auto& inverse_table_prime = params.GetNttInversePrimeTable(coeff_mod);
+ std::uint64_t modulus = params.Moduli(coeff_mod);
+ std::uint64_t two_times_modulus = 2 * modulus;
+
+ for (std::size_t mm = params.PolyLenLog2(); mm-- > 0;) {
+ std::size_t h = 1 << mm;
+ std::size_t t = n >> (mm + 1);
+
+ for (std::size_t i = 0; i < h; ++i) {
+ uint64_t w = inverse_table[h + i];
+ uint64_t w_prime = inverse_table_prime[h + i];
+
+ auto op = operand.subspan(i * 2 * t, 2 * t);
+
+ for (size_t j = 0; j < t; ++j) {
+ uint64_t x = op[j];
+ uint64_t y = op[t + j];
+
+ uint64_t t_tmp = two_times_modulus - y + x;
+          uint64_t curr_x =
+              x + y -
+              (two_times_modulus * static_cast<uint64_t>((x << 1) >= t_tmp));
+ uint64_t h_tmp = (t_tmp * w_prime) >> 32;
+
+ uint64_t res_x = (curr_x + (modulus * (t_tmp & 1))) >> 1;
+ uint64_t res_y = w * t_tmp - h_tmp * modulus;
+
+ op[j] = res_x;
+ op[t + j] = res_y;
+ }
+ }
+ }
+
+ for (size_t i = 0; i < n; ++i) {
+      operand[i] -= static_cast<std::uint64_t>(operand[i] >= two_times_modulus) *
+                    two_times_modulus;
+      operand[i] -= static_cast<std::uint64_t>(operand[i] >= modulus) * modulus;
+ }
+ }
+}
+
+#else
+
+void NttInverse(const Params& params, absl::Span<uint64_t> operand_overall) {
+ SPDLOG_DEBUG("use AVX2 NttInverse");
+
+ for (size_t coeff_mod = 0; coeff_mod < params.CrtCount(); ++coeff_mod) {
+ size_t n = params.PolyLen();
+ auto operand = operand_overall.subspan(coeff_mod * n, n);
+
+ const auto& inverse_table = params.GetNttInverseTable(coeff_mod);
+ const auto& inverse_table_prime = params.GetNttInversePrimeTable(coeff_mod);
+ uint64_t modulus = params.Moduli(coeff_mod);
+ uint64_t two_times_modulus = 2 * modulus;
+
+ for (size_t mm = params.PolyLenLog2(); mm-- > 0;) {
+ size_t h = 1 << mm;
+ size_t t = n >> (mm + 1);
+
+ for (size_t i = 0; i < h; ++i) {
+ uint64_t w = inverse_table[h + i];
+ uint64_t w_prime = inverse_table_prime[h + i];
+
+ auto op = operand.subspan(i * 2 * t, 2 * t); // 获取当前的操作段
+ if (op.size() < 2 * t) {
+ throw std::runtime_error("Operation span is too small.");
+ }
+
+ if (t < 4) {
+ for (size_t j = 0; j < t; ++j) {
+ uint64_t x = op[j];
+ uint64_t y = op[t + j];
+
+ uint64_t t_tmp = two_times_modulus - y + x;
+ uint64_t curr_x = x + y - (two_times_modulus * ((x << 1) >= t_tmp));
+ uint64_t h_tmp = (t_tmp * w_prime) >> 32;
+
+ uint64_t res_x = (curr_x + (modulus * (t_tmp & 1))) >> 1;
+ uint64_t res_y = w * t_tmp - h_tmp * modulus;
+
+ op[j] = res_x;
+ op[t + j] = res_y;
+ }
+ } else {
+ for (size_t j = 0; j < t; j += 4) {
+            __m256i x =
+                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&op[j]));
+            __m256i y = _mm256_loadu_si256(
+                reinterpret_cast<const __m256i*>(&op[j + t]));
+
+            __m256i modulus_vec =
+                _mm256_set1_epi64x(static_cast<std::int64_t>(modulus));
+            __m256i two_times_modulus_vec =
+                _mm256_set1_epi64x(static_cast<std::int64_t>(two_times_modulus));
+ __m256i t_tmp = _mm256_sub_epi64(two_times_modulus_vec, y);
+ t_tmp = _mm256_add_epi64(t_tmp, x);
+
+ // __m256i gt_mask =
+ // _mm256_cmpgt_epi64(_mm256_slli_epi64(x, 1), t_tmp);
+ __m256i tmp1 = _mm256_cmpgt_epi64(_mm256_slli_epi64(x, 1), t_tmp);
+ // __m256i to_subtract =
+ // _mm256_and_si256(tmp1, two_times_modulus_vec);
+
+ tmp1 = _mm256_and_si256(tmp1, two_times_modulus_vec);
+
+ __m256i curr_x = _mm256_add_epi64(x, y);
+ curr_x = _mm256_sub_epi64(curr_x, tmp1);
+
+            // __m256i w_prime_vec =
+            //     _mm256_set1_epi64x(static_cast<std::int64_t>(w_prime));
+            tmp1 = _mm256_set1_epi64x(static_cast<std::int64_t>(w_prime));
+ __m256i h_tmp = _mm256_mul_epu32(t_tmp, tmp1);
+ h_tmp = _mm256_srli_epi64(h_tmp, 32);
+
+ // __m256i and_mask = _mm256_set1_epi64x(1);
+ tmp1 = _mm256_set1_epi64x(1);
+ __m256i eq_mask =
+ _mm256_cmpeq_epi64(_mm256_and_si256(t_tmp, tmp1), tmp1);
+ // __m256i to_add = _mm256_and_si256(eq_mask, modulus_vec);
+ tmp1 = _mm256_and_si256(eq_mask, modulus_vec);
+
+ // __m256i new_x =
+ // _mm256_srli_epi64(_mm256_add_epi64(curr_x, tmp1), 1);
+ tmp1 = _mm256_srli_epi64(_mm256_add_epi64(curr_x, tmp1), 1);
+
+            __m256i w_vec = _mm256_set1_epi64x(static_cast<std::int64_t>(w));
+ __m256i w_times_t_tmp = _mm256_mul_epu32(t_tmp, w_vec);
+ __m256i h_tmp_times_modulus = _mm256_mul_epu32(h_tmp, modulus_vec);
+ __m256i new_y =
+ _mm256_sub_epi64(w_times_t_tmp, h_tmp_times_modulus);
+
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(&op[j]), tmp1);
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(&op[j + t]), new_y);
+ }
+ }
+ }
+ }
+
+ for (size_t i = 0; i < n; ++i) {
+      operand[i] -= static_cast<std::uint64_t>(operand[i] >= two_times_modulus) *
+                    two_times_modulus;
+      operand[i] -= static_cast<std::uint64_t>(operand[i] >= modulus) * modulus;
+ }
+ }
+}
+
+#endif
+
+} // namespace psi::spiral::arith
diff --git a/psi/legacy/kmprt17_mp_psi.cc b/psi/algorithm/spiral/arith/ntt.h
similarity index 52%
rename from psi/legacy/kmprt17_mp_psi.cc
rename to psi/algorithm/spiral/arith/ntt.h
index f69f656..65109b6 100644
--- a/psi/legacy/kmprt17_mp_psi.cc
+++ b/psi/algorithm/spiral/arith/ntt.h
@@ -1,4 +1,4 @@
-// Copyright 2024 zhangwfjh
+// Copyright 2024 Ant Group Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -11,26 +11,18 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+#pragma once
-#include "psi/legacy/kmprt17_mp_psi.h"
+#include <cstddef>
+#include <cstdint>
-#include
+#include "absl/types/span.h"
-#include "psi/legacy/factory.h"
+#include "psi/algorithm/spiral/params.h"
-namespace psi::psi {
+namespace psi::spiral::arith {
-namespace {
+void NttForward(const Params& params, absl::Span<uint64_t> operand_overall);
+void NttInverse(const Params& params, absl::Span<uint64_t> operand_overall);
-std::unique_ptr CreateOperator(
- const MemoryPsiConfig& config,
- const std::shared_ptr& lctx) {
- return std::make_unique(
- KmprtMpPsiOperator::Options({lctx, config.receiver_rank()}));
-}
-
-REGISTER_OPERATOR(KMPRT_PSI_NPC, CreateOperator);
-
-} // namespace
-
-} // namespace psi::psi
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/ntt_table.cc b/psi/algorithm/spiral/arith/ntt_table.cc
new file mode 100644
index 0000000..eed9346
--- /dev/null
+++ b/psi/algorithm/spiral/arith/ntt_table.cc
@@ -0,0 +1,161 @@
+
+
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "psi/algorithm/spiral/arith/ntt_table.h"
+
+#include
+
+#include "seal/modulus.h"
+#include "yacl/base/exception.h"
+
+#include "psi/algorithm/spiral/arith/number_theory.h"
+
+namespace psi::spiral::arith {
+
+std::vector<std::uint64_t> ScalePowersU32(
+    std::uint32_t modulus, std::size_t poly_len,
+    const std::vector<std::uint64_t>& in) {
+  std::vector<std::uint64_t> scaled_powers(poly_len, 0ULL);
+
+ for (std::size_t i = 0; i < poly_len; ++i) {
+    std::uint64_t wide_val = in[i] << 32;
+    std::uint64_t quotient = wide_val / static_cast<std::uint64_t>(modulus);
+    scaled_powers[i] =
+        static_cast<std::uint64_t>((static_cast<std::uint32_t>(quotient)));
+ }
+
+ return scaled_powers;
+}
+
+std::vector<std::uint64_t> PowersOfPrimitiveRoot(std::uint64_t root,
+                                                 std::uint64_t modulus,
+                                                 std::size_t poly_len_log2) {
+  std::size_t poly_len = 1 << poly_len_log2;
+  std::vector<std::uint64_t> root_powers(poly_len, 0ULL);
+ std::uint64_t power = root;
+
+ root_powers[0] = 1;
+ seal::Modulus mod(modulus);
+ for (std::size_t i = 1; i < poly_len; ++i) {
+ auto idx = arith::ReverseBits(i, poly_len_log2);
+ root_powers[idx] = power;
+ power = arith::MultiplyUintMod(power, root, mod);
+ }
+ return root_powers;
+}
+
+std::vector<std::uint64_t> PowersOfPrimitiveRoot(std::uint64_t root,
+                                                 const seal::Modulus& mod,
+                                                 std::size_t poly_len_log2) {
+  std::size_t poly_len = 1 << poly_len_log2;
+  std::vector<std::uint64_t> root_powers(poly_len, 0ULL);
+ std::uint64_t power = root;
+
+ root_powers[0] = 1;
+ for (std::size_t i = 1; i < poly_len; ++i) {
+ auto idx = arith::ReverseBits(i, poly_len_log2);
+ root_powers[idx] = power;
+ power = arith::MultiplyUintMod(power, root, mod);
+ }
+ return root_powers;
+}
+
+NttTables BuildNttTables(std::size_t poly_len,
+                         const std::vector<std::uint64_t>& moduli) {
+ YACL_ENFORCE(poly_len > 0);
+ YACL_ENFORCE(moduli.size() > 0);
+
+ std::size_t poly_len_log2 = arith::Log2(poly_len);
+
+ NttTables tables;
+
+ for (std::size_t i = 0; i < moduli.size(); ++i) {
+ std::uint64_t modulus = moduli[i];
+ seal::Modulus mod(modulus);
+ // todo: why need convert? maybe reduce error?
+    auto modulus_u32 = static_cast<std::uint32_t>(modulus);
+
+ std::uint64_t root = arith::GetMinimalPrimitiveRoot(2 * poly_len, mod);
+ std::uint64_t inv_root = arith::InvertUintMod(root, mod);
+
+ auto root_powers = PowersOfPrimitiveRoot(root, mod, poly_len_log2);
+
+ auto scaled_root_powers =
+ ScalePowersU32(modulus_u32, poly_len, root_powers);
+
+ auto inv_root_power = PowersOfPrimitiveRoot(inv_root, mod, poly_len_log2);
+
+ for (std::size_t j = 0; j < poly_len; ++j) {
+ inv_root_power[j] = arith::Div2UintMod(inv_root_power[j], mod);
+ }
+
+ auto scaled_inv_root_powers =
+ ScalePowersU32(modulus_u32, poly_len, inv_root_power);
+
+    std::vector<std::vector<std::uint64_t>> temp{
+ std::move(root_powers), std::move(scaled_root_powers),
+ std::move(inv_root_power), std::move(scaled_inv_root_powers)};
+
+ tables.emplace_back(std::move(temp));
+ }
+
+ return tables;
+}
+
+NttTables BuildNttTables(std::size_t poly_len,
+                         const std::vector<seal::Modulus>& moduli) {
+ YACL_ENFORCE(poly_len > 0);
+ YACL_ENFORCE(moduli.size() > 0);
+
+ std::size_t poly_len_log2 = arith::Log2(poly_len);
+
+ NttTables tables;
+
+ for (std::size_t i = 0; i < moduli.size(); ++i) {
+ std::uint64_t modulus = moduli[i].value();
+ // todo: why need convert? maybe reduce error?
+    auto modulus_u32 = static_cast<std::uint32_t>(modulus);
+
+ std::uint64_t root =
+ arith::GetMinimalPrimitiveRoot(2 * poly_len, moduli[i]);
+ std::uint64_t inv_root = arith::InvertUintMod(root, moduli[i]);
+
+ auto root_powers = PowersOfPrimitiveRoot(root, moduli[i], poly_len_log2);
+
+ auto scaled_root_powers =
+ ScalePowersU32(modulus_u32, poly_len, root_powers);
+
+ auto inv_root_power =
+ PowersOfPrimitiveRoot(inv_root, moduli[i], poly_len_log2);
+
+ for (std::size_t j = 0; j < poly_len; ++j) {
+ inv_root_power[j] = arith::Div2UintMod(inv_root_power[j], moduli[i]);
+ }
+
+ auto scaled_inv_root_powers =
+ ScalePowersU32(modulus_u32, poly_len, inv_root_power);
+
+    std::vector<std::vector<std::uint64_t>> temp{
+ std::move(root_powers), std::move(scaled_root_powers),
+ std::move(inv_root_power), std::move(scaled_inv_root_powers)};
+
+ tables.emplace_back(std::move(temp));
+ }
+
+ return tables;
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/ntt_table.h b/psi/algorithm/spiral/arith/ntt_table.h
new file mode 100644
index 0000000..8804e09
--- /dev/null
+++ b/psi/algorithm/spiral/arith/ntt_table.h
@@ -0,0 +1,40 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "seal/modulus.h"
+
+namespace psi::spiral::arith {
+
+using NttTables = std::vector<std::vector<std::vector<std::uint64_t>>>;
+
+NttTables BuildNttTables(std::size_t poly_len,
+                         const std::vector<std::uint64_t>& moduli);
+
+NttTables BuildNttTables(std::size_t poly_len,
+                         const std::vector<seal::Modulus>& moduli);
+
+std::vector<std::uint64_t> ScalePowersU32(std::uint32_t modulus,
+                                          std::size_t poly_len,
+                                          const std::vector<std::uint64_t>& in);
+
+std::vector<std::uint64_t> PowersOfPrimitiveRoot(std::uint64_t root,
+                                                 std::uint64_t modulus,
+                                                 std::size_t poly_len_log2);
+
+} // namespace psi::spiral::arith
\ No newline at end of file
diff --git a/psi/algorithm/spiral/arith/ntt_table_test.cc b/psi/algorithm/spiral/arith/ntt_table_test.cc
new file mode 100644
index 0000000..05fea79
--- /dev/null
+++ b/psi/algorithm/spiral/arith/ntt_table_test.cc
@@ -0,0 +1,127 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "psi/algorithm/spiral/arith/ntt_table.h"
+
+#include <chrono>
+#include <cstdint>
+#include <random>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "yacl/base/aligned_vector.h"
+
+#include "psi/algorithm/spiral/arith/ntt.h"
+#include "psi/algorithm/spiral/params.h"
+#include "psi/algorithm/spiral/util.h"
+
+namespace psi::spiral::arith {
+
+namespace {
+
+constexpr std::uint64_t kRefVal{519370102};
+
+constexpr std::size_t kMaxLoop{100};
+
+} // namespace
+
+TEST(NttTest, BuildNttTables) {
+  std::vector<std::uint64_t> moduli{268369921ULL, 249561089ULL};
+ std::size_t poly_len{2048};
+
+ NttTables res = arith::BuildNttTables(poly_len, moduli);
+
+ ASSERT_EQ(2, res.size());
+ ASSERT_EQ(4, res[0].size());
+ ASSERT_EQ(poly_len, res[0][0].size());
+
+ ASSERT_EQ(134184961, res[0][2][0]);
+ ASSERT_EQ(96647580, res[0][2][1]);
+
+ std::uint64_t x1 = 0;
+ for (std::size_t i = 0; i < res.size(); ++i) {
+ for (std::size_t j = 0; j < res[0].size(); ++j) {
+ for (std::size_t k = 0; k < res[0][0].size(); ++k) {
+ x1 ^= res[i][j][k];
+ }
+ }
+ }
+ ASSERT_EQ(kRefVal, x1);
+}
+
+TEST(NttTest, NttForward) {
+ auto params = util::GetFastExpansionTestingParam();
+
+  std::vector<uint64_t> v1(2 * 2048, 0);
+ v1[0] = 100;
+ v1[2048] = 100;
+
+ arith::NttForward(params, absl::MakeSpan(v1));
+ ASSERT_EQ(v1[50], 100);
+ ASSERT_EQ(v1[2048 + 50], 100);
+}
+
+TEST(NttTest, NttInverse) {
+ auto params = util::GetFastExpansionTestingParam();
+
+  std::vector<uint64_t> v1(2 * 2048, 100);
+ arith::NttInverse(params, absl::MakeSpan(v1));
+ ASSERT_EQ(v1[0], 100);
+ ASSERT_EQ(v1[2048], 100);
+ ASSERT_EQ(v1[50], 0);
+ ASSERT_EQ(v1[2048 + 50], 0);
+}
+
+TEST(NttTest, NttCorrect) {
+ auto params = util::GetFastExpansionTestingParam();
+
+  std::vector<uint64_t> v1(params.CrtCount() * params.PolyLen());
+ std::random_device rd;
+ std::mt19937_64 prg(rd());
+
+ uint64_t total_time = 0;
+
+ for (size_t l = 0; l < kMaxLoop; ++l) {
+ for (size_t i = 0; i < params.CrtCount(); ++i) {
+ for (size_t j = 0; j < params.PolyLen(); ++j) {
+        std::vector<size_t> indices{i, j};
+        std::vector<size_t> lengths{params.CrtCount(), params.PolyLen()};
+ auto idx = util::CalcIndex(indices, lengths);
+ uint64_t val = prg();
+ v1[idx] = val % params.Moduli(i);
+ }
+ }
+ // copy
+    std::vector<uint64_t> v2(v1.begin(), v1.end());
+
+ auto start = std::chrono::high_resolution_clock::now();
+ // forward
+ arith::NttForward(params, absl::MakeSpan(v2));
+ // inverse
+ arith::NttInverse(params, absl::MakeSpan(v2));
+ auto end = std::chrono::high_resolution_clock::now();
+
+ auto duration =
+ std::chrono::duration_cast(end - start);
+
+ total_time += duration.count();
+
+ ASSERT_EQ(v2, v1);
+ }
+
+ SPDLOG_INFO("{} Ntts, total time: {} micro-sec, each Ntt, time: {}", kMaxLoop,
+ total_time, static_cast(total_time) / kMaxLoop);
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/number_theory.h b/psi/algorithm/spiral/arith/number_theory.h
new file mode 100644
index 0000000..747d8d6
--- /dev/null
+++ b/psi/algorithm/spiral/arith/number_theory.h
@@ -0,0 +1,94 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "seal/seal.h"
+#include "seal/util/numth.h"
+#include "yacl/base/exception.h"
+
+#include "psi/algorithm/spiral/arith/arith.h"
+
+namespace psi::spiral::arith {
+
+inline bool IsPrimitiveRoot(std::uint64_t root, std::uint64_t degree,
+ std::uint64_t modulus) {
+ if (root == 0) {
+ return false;
+ }
+ return arith::ExponentitateUintMod(root, degree >> 1, modulus) == modulus - 1;
+}
+
+inline std::uint64_t GetPrimitiveRoot(std::uint64_t degree,
+ std::uint64_t modulus) {
+ YACL_ENFORCE(modulus > 1);
+ YACL_ENFORCE(degree >= 2);
+
+ std::uint64_t result = 0ULL;
+ // todo: consider remove seal Modulus usage
+ seal::Modulus mod(modulus);
+ // return must be true
+ YACL_ENFORCE(seal::util::try_primitive_root(degree, mod, result),
+ "{} mod {} primitive root do not exits", degree, modulus);
+
+ return result;
+}
+
+inline std::uint64_t GetPrimitiveRoot(std::uint64_t degree,
+ const seal::Modulus& mod) {
+ YACL_ENFORCE(mod.value() > 1);
+ YACL_ENFORCE(degree >= 2);
+
+ std::uint64_t result = 0ULL;
+ // return must be true
+ YACL_ENFORCE(seal::util::try_primitive_root(degree, mod, result),
+ "{} mod {} primitive root do not exits", degree, mod.value());
+
+ return result;
+}
+
+inline std::uint64_t GetMinimalPrimitiveRoot(std::uint64_t degree,
+ std::uint64_t modulus) {
+ std::uint64_t result{0};
+ // todo: consider remove seal Modulus usage
+ seal::Modulus mod(modulus);
+ // return must be true
+ YACL_ENFORCE(seal::util::try_minimal_primitive_root(degree, mod, result));
+ return result;
+}
+
+inline std::uint64_t GetMinimalPrimitiveRoot(std::uint64_t degree,
+ const seal::Modulus& mod) {
+ std::uint64_t result{0};
+ // return must be true
+ YACL_ENFORCE(seal::util::try_minimal_primitive_root(degree, mod, result));
+ return result;
+}
+
+inline std::uint64_t InvertUintMod(std::uint64_t value, std::uint64_t modulus) {
+ YACL_ENFORCE(value > 0);
+ seal::Modulus mod(modulus);
+ std::uint64_t result{0};
+ YACL_ENFORCE(seal::util::try_invert_uint_mod(value, mod, result));
+ return result;
+}
+
+inline std::uint64_t InvertUintMod(std::uint64_t value,
+ const seal::Modulus& mod) {
+ YACL_ENFORCE(value > 0);
+ std::uint64_t result{0};
+ YACL_ENFORCE(seal::util::try_invert_uint_mod(value, mod, result));
+ return result;
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/number_theory_test.cc b/psi/algorithm/spiral/arith/number_theory_test.cc
new file mode 100644
index 0000000..fafaf78
--- /dev/null
+++ b/psi/algorithm/spiral/arith/number_theory_test.cc
@@ -0,0 +1,80 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "psi/algorithm/spiral/arith/number_theory.h"
+
+#include <algorithm>
+
+#include "gtest/gtest.h"
+#include "yacl/base/exception.h"
+
+namespace psi::spiral::arith {
+
+TEST(NumberTheoryTest, IsPrimitiveRoot) {
+ std::uint64_t modulus{11};
+ ASSERT_TRUE(arith::IsPrimitiveRoot(10, 2, modulus));
+ ASSERT_FALSE(arith::IsPrimitiveRoot(9, 2, modulus));
+ ASSERT_FALSE(arith::IsPrimitiveRoot(10, 4, modulus));
+}
+
+TEST(NumberTheoryTest, GetPrimitiveRoot) {
+ std::uint64_t modulus{11};
+
+ ASSERT_EQ(10, arith::GetPrimitiveRoot(2, modulus));
+ // primitive root do not exist
+ ASSERT_THROW(arith::GetPrimitiveRoot(3, modulus), yacl::EnforceNotMet);
+
+ modulus = 29;
+ ASSERT_EQ(28, arith::GetPrimitiveRoot(2, modulus));
+
+  std::vector<std::uint64_t> corrects{12, 17};
+ ASSERT_TRUE(std::find(corrects.begin(), corrects.end(),
+ arith::GetPrimitiveRoot(4, modulus)) != corrects.end());
+}
+
+TEST(NumberTheoryTest, GetMinimalPrimitiveRoot) {
+ std::uint64_t modulus{11};
+ ASSERT_EQ(10, arith::GetMinimalPrimitiveRoot(2, modulus));
+
+ modulus = 29;
+ ASSERT_EQ(28, arith::GetMinimalPrimitiveRoot(2, modulus));
+ ASSERT_EQ(12, arith::GetMinimalPrimitiveRoot(4, modulus));
+
+ modulus = 1234565441;
+ ASSERT_EQ(1234565440ULL, arith::GetMinimalPrimitiveRoot(2, modulus));
+ ASSERT_EQ(249725733ULL, arith::GetMinimalPrimitiveRoot(8, modulus));
+}
+
+TEST(NumberTheoryTest, InvertUintMod) {
+ std::uint64_t modulus;
+ std::uint64_t input;
+
+ input = 1;
+ modulus = 2;
+ ASSERT_EQ(1, arith::InvertUintMod(input, modulus));
+
+ input = 2;
+ modulus = 2;
+ ASSERT_THROW(arith::InvertUintMod(input, modulus), yacl::EnforceNotMet);
+
+ input = 3;
+ modulus = 2;
+ ASSERT_EQ(1, arith::InvertUintMod(input, modulus));
+
+ input = 0xFFFFFF;
+ modulus = 2;
+ ASSERT_EQ(1, arith::InvertUintMod(input, modulus));
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/common.h b/psi/algorithm/spiral/common.h
new file mode 100644
index 0000000..4a217fa
--- /dev/null
+++ b/psi/algorithm/spiral/common.h
@@ -0,0 +1,139 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "absl/types/span.h"
+#include "spdlog/spdlog.h"
+
+namespace psi::spiral {
+
+template