diff --git a/.bazeliskrc b/.bazeliskrc
new file mode 100644
index 0000000..f4c1884
--- /dev/null
+++ b/.bazeliskrc
@@ -0,0 +1 @@
+USE_BAZEL_VERSION=7.4.1
\ No newline at end of file
diff --git a/.bazelrc b/.bazelrc
index a73f8c7..ec2e009 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -15,6 +15,10 @@
common --experimental_repo_remote_exec
common --experimental_cc_shared_library
+common --registry=https://raw.githubusercontent.com/secretflow/bazel-registry/main
+common --registry=https://bcr.bazel.build
+common --registry=https://baidu.github.io/babylon/registry
+
build --incompatible_new_actions_api=false
build --copt=-fdiagnostics-color=always
build --enable_platform_specific_config
diff --git a/.bazelversion b/.bazelversion
deleted file mode 100644
index f22d756..0000000
--- a/.bazelversion
+++ /dev/null
@@ -1 +0,0 @@
-6.5.0
diff --git a/.clang-tidy b/.clang-tidy
index a82670a..e422034 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -76,3 +76,4 @@ CheckOptions:
- key: performance-unnecessary-value-param.AllowedTypes
value: PtBufferView
+
diff --git a/.gitignore b/.gitignore
index 0f9ce30..248050f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ Pipfile
# bazel
bazel-*
+MODULE.bazel.lock
# cmake related
abseil-cpp
@@ -44,4 +45,4 @@ rpc_data
coverity*/
idir/
-ossutil_output/
\ No newline at end of file
+ossutil_output/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..02d6def
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,116 @@
+# Contribution guidelines
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project.
+
+## Style
+
+### C++ coding style
+
+In general, please use clang-format to format code, and follow clang-tidy tips.
+
+Most of the code style is derived from the
+[Google C++ style guidelines](https://google.github.io/styleguide/cppguide.html), except:
+
+- Exceptions are allowed and encouraged where appropriate.
+- Header guards should use `#pragma once`.
+- Adopt [camelBack](https://llvm.org/docs/Proposals/VariableNames.html#variable-names-coding-standard-options)
+ for function names.
+- Use [fixed width integer types](https://en.cppreference.com/w/cpp/types/integer) whenever possible.
+- Avoid using size_t on interface APIs.
+
+The compiler portion of the project follows [MLIR style](https://mlir.llvm.org/getting_started/DeveloperGuide/#style-guide).
+
+### Other tips
+
+- Git commit messages should be meaningful; we suggest imperative [keywords](https://github.com/joelparkerhenderson/git_commit_message#summary-keywords).
+- Developers must write unit tests (line coverage must be greater than 80%); tests should be deterministic.
+- Read awesome [Abseil Tips](https://abseil.io/tips/)
+
+## Build
+
+### Prerequisite
+
+
+#### Docker
+
+```sh
+## start container
+docker run -d -it --name psi-dev-$(whoami) \
+ --mount type=bind,source="$(pwd)",target=/home/admin/dev/ \
+ -w /home/admin/dev \
+ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+ --cap-add=NET_ADMIN \
+ --privileged=true \
+ secretflow/ubuntu-base-ci:latest \
+ bash
+
+# attach to build container
+docker exec -it psi-dev-$(whoami) bash
+```
+
+#### Linux
+
+```sh
+Install gcc>=11.2, cmake>=3.26, ninja, nasm>=2.15, python>=3.10, bazelisk, xxd, lld
+```
+
+#### macOS
+
+```sh
+# macOS >= 13.0, Xcode >= 15.0
+
+# Install Xcode
+https://apps.apple.com/us/app/xcode/id497799835?mt=12
+
+# Select Xcode toolchain version
+sudo xcode-select -s /Applications/Xcode.app/Contents/Developer
+
+# Install homebrew
+https://brew.sh/
+
+# Install dependencies
+# Be aware: brew may install a newer version of bazel; when that happens, bazel will give an error message during build.
+# Please follow instructions in the error message to install the required version
+brew install bazelisk cmake ninja libomp wget
+
+# For Intel mac only
+brew install nasm
+```
+
+### Build & UnitTest
+
+
+
+
+``` sh
+# build as debug
+bazel build //... -c dbg
+
+# build as release
+bazel build //... -c opt
+
+# test
+bazel test //...
+
+# [optional] build & test with ASAN or UBSAN, for macOS users please use configs with macOS prefix
+bazel test //... --features=asan
+bazel test //... --features=ubsan
+```
+
+### Bazel build options
+
+- `--define gperf=on` enable gperf
+
+### Build docs
+
+```sh
+# prerequisite
+pip install -U -r docs/requirements.txt
+
+cd docs && make html # html docs will be in docs/_build/html
+```
diff --git a/MODULE.bazel b/MODULE.bazel
new file mode 100644
index 0000000..0cea1dd
--- /dev/null
+++ b/MODULE.bazel
@@ -0,0 +1,83 @@
+###############################################################################
+# Bazel now uses Bzlmod by default to manage external dependencies.
+# Please consider migrating your external dependencies from WORKSPACE to MODULE.bazel.
+#
+# For more details, please check https://github.com/bazelbuild/bazel/issues/18958
+###############################################################################
+
+module(
+ name = "psi",
+ version = "0.6.0.dev241212",
+ compatibility_level = 1,
+)
+
+bazel_dep(name = "yacl", version = "20241212.0-871832a")
+
+single_version_override(
+ module_name = "grpc",
+ patch_strip = 1,
+ patches = [
+ "//bazel/patches:grpc-1.66.patch",
+ "//bazel/patches:grpc-module-file.patch",
+ ],
+ version = "1.66.0.bcr.3",
+)
+
+bazel_dep(name = "platforms", version = "0.0.8")
+bazel_dep(name = "apple_support", version = "1.17.1")
+bazel_dep(name = "rules_cc", version = "0.0.13")
+bazel_dep(name = "rules_proto", version = "6.0.0.bcr.1")
+bazel_dep(name = "rules_foreign_cc", version = "0.12.0")
+bazel_dep(name = "protobuf", version = "27.3")
+bazel_dep(name = "spdlog", version = "1.14.1")
+bazel_dep(name = "fmt", version = "11.0.2")
+bazel_dep(name = "abseil-cpp", version = "20240722.0")
+bazel_dep(name = "gflags", version = "2.2.2")
+bazel_dep(name = "rapidjson", version = "1.1.0.bcr.20241007")
+bazel_dep(name = "boost.math", version = "1.83.0")
+bazel_dep(name = "boost.uuid", version = "1.83.0")
+bazel_dep(name = "boost.algorithm", version = "1.83.0.bcr.1")
+bazel_dep(name = "boost.multiprecision", version = "1.83.0")
+bazel_dep(name = "zlib", version = "1.3.1.bcr.3")
+
+# --registry=https://baidu.github.io/babylon/registry
+bazel_dep(name = "openssl", version = "3.3.2")
+
+# self-host registry
+bazel_dep(name = "org_interconnection", version = "0.0.1")
+bazel_dep(name = "fourqlib", version = "0.0.0-20220901-1031567")
+bazel_dep(name = "arrow", version = "10.0.0", repo_name = "org_apache_arrow")
+bazel_dep(name = "ippcp", version = "2021.8")
+bazel_dep(name = "libdivide", version = "5.0")
+bazel_dep(name = "emp-tool", version = "0.2.5")
+bazel_dep(name = "sparsehash", version = "2.0.4")
+bazel_dep(name = "sse2neon", version = "1.7.0-20240330-8df2f48")
+
+# non module dependencies
+non_module_dependencies = use_extension("//bazel:defs.bzl", "non_module_dependencies")
+use_repo(
+ non_module_dependencies,
+ "apsi",
+ "curve25519-donna",
+ "kuku",
+ "perfetto",
+ "seal",
+ "zstd",
+)
+
+new_local_repository = use_repo_rule("@bazel_tools//tools/build_defs/repo:local.bzl", "new_local_repository")
+new_local_repository(
+ name = "macos_omp_x64",
+ build_file = "@yacl//bazel:local_openmp_macos.BUILD",
+ path = "/usr/local/opt/libomp",
+)
+
+new_local_repository(
+ name = "macos_omp_arm64",
+ build_file = "@yacl//bazel:local_openmp_macos.BUILD",
+ path = "/opt/homebrew/opt/libomp/",
+)
+
+# test
+bazel_dep(name = "googletest", version = "1.15.2", dev_dependency = True, repo_name = "com_google_googletest")
+bazel_dep(name = "google_benchmark", version = "1.8.5", dev_dependency = True, repo_name = "com_github_google_benchmark")
diff --git a/README.md b/README.md
index b2cc375..d09ca45 100644
--- a/README.md
+++ b/README.md
@@ -122,7 +122,7 @@ In the first terminal, run the following command
docker run -it --rm --network host --mount type=bind,source=/tmp/receiver,target=/root/receiver --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=NET_ADMIN --privileged=true secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/psi-anolis8:latest --config receiver/receiver.config
```
-In the other terminal, run the following command simultaneously.
+In the other terminal, run the following command simultaneously.
```bash
docker run -it --rm --network host --mount type=bind,source=/tmp/sender,target=/root/sender --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=NET_ADMIN --privileged=true secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/psi-anolis8:latest --config sender/sender.config
@@ -166,7 +166,7 @@ Install gcc>=11.2, cmake>=3.26, ninja, nasm>=2.15, python>=3.8, bazel, golang, x
```
> **Note**
-Please install bazel with version in .bazelversion or use bazelisk.
+Please install bazel with version in .bazeliskrc or use bazelisk.
### Build & UnitTest
@@ -213,3 +213,4 @@ Please refer to [PSI V2 Benchmark](docs/user_guide/psi_v2_benchmark.md)
## APSI Benchmark
Please refer to [APSI Benchmark](docs/user_guide/apsi_benchmark.md)
+
diff --git a/RELEASE.md b/RELEASE.md
index d20f8ba..c8327d1 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -40,7 +40,7 @@
## v0.3.0beta
- [Improvement] add uuid in system temp folder.
- [Improvement] use arrow csv reader in pir.
-- [Bugfix] fix typo in psi config check.
+- [Bugfix] fix typo in psi config check.
## v0.3.0.dev240304
- [API] expose ic_mode in RunLegacyPsi api
diff --git a/WORKSPACE b/WORKSPACE
deleted file mode 100644
index 3771892..0000000
--- a/WORKSPACE
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2021 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-workspace(name = "psi")
-
-load("//bazel:repositories.bzl", "psi_deps")
-
-psi_deps()
-
-#
-# yacl
-# Warning: psi relies on yacl to bring in common 3p libraries.
-# Please make sure yacl_deps are called right after psi_deps.
-#
-load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
-load("@yacl//bazel:repositories.bzl", "yacl_deps")
-
-yacl_deps()
-
-load("@rules_python//python:repositories.bzl", "py_repositories")
-
-py_repositories()
-
-load(
- "@rules_foreign_cc//foreign_cc:repositories.bzl",
- "rules_foreign_cc_dependencies",
-)
-
-rules_foreign_cc_dependencies(
- register_built_tools = False,
- register_default_tools = False,
- register_preinstalled_tools = True,
-)
-
-load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
-
-grpc_deps()
-
-# Not mentioned in official docs... mentioned here https://github.com/grpc/grpc/issues/20511
-load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
-
-grpc_extra_deps()
-
-#
-# boost
-#
-load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps")
-
-boost_deps()
diff --git a/bazel/arrow.BUILD b/bazel/arrow.BUILD
deleted file mode 100644
index c8fd52c..0000000
--- a/bazel/arrow.BUILD
+++ /dev/null
@@ -1,169 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# copied from https://github.com/tensorflow/io/blob/master/third_party/arrow.BUILD and made some changes
-# Description:
-# Apache Arrow library
-
-load("@com_github_grpc_grpc//bazel:cc_grpc_library.bzl", "cc_grpc_library")
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"]) # Apache 2.0
-
-exports_files(["LICENSE.txt"])
-
-genrule(
- name = "arrow_util_config",
- srcs = ["cpp/src/arrow/util/config.h.cmake"],
- outs = ["cpp/src/arrow/util/config.h"],
- cmd = ("sed " +
- "-e 's/@ARROW_VERSION_MAJOR@/9/g' " +
- "-e 's/@ARROW_VERSION_MINOR@/0/g' " +
- "-e 's/@ARROW_VERSION_PATCH@/0/g' " +
- "-e 's/cmakedefine ARROW_USE_NATIVE_INT128/undef ARROW_USE_NATIVE_INT128/g' " +
- "-e 's/cmakedefine ARROW_WITH_OPENTELEMETRY/undef ARROW_WITH_OPENTELEMETRY/g' " +
- "-e 's/cmakedefine ARROW_GCS/undef ARROW_GCS/g' " +
- "-e 's/cmakedefine ARROW_S3/undef ARROW_S3/g' " +
- "-e 's/cmakedefine ARROW_JEMALLOC/undef ARROW_JEMALLOC/g' " +
- "-e 's/cmakedefine ARROW_JEMALLOC_VENDORED/undef ARROW_JEMALLOC_VENDORED/g' " +
- "-e 's/cmakedefine/define/g' " +
- "$< >$@"),
-)
-
-genrule(
- name = "parquet_version_h",
- srcs = ["cpp/src/parquet/parquet_version.h.in"],
- outs = ["cpp/src/parquet/parquet_version.h"],
- cmd = ("sed " +
- "-e 's/@PARQUET_VERSION_MAJOR@/1/g' " +
- "-e 's/@PARQUET_VERSION_MINOR@/5/g' " +
- "-e 's/@PARQUET_VERSION_PATCH@/1/g' " +
- "$< >$@"),
-)
-
-cc_library(
- name = "arrow_vendored",
- srcs = glob([
- "cpp/src/arrow/vendored/datetime/*.h",
- "cpp/src/arrow/vendored/datetime/*.cpp",
- "cpp/src/arrow/vendored/pcg/pcg_uint128.hpp",
- "cpp/src/arrow/vendored/pcg/pcg_random.hpp",
- "cpp/src/arrow/vendored/pcg/pcg_extras.hpp",
- "cpp/src/arrow/vendored/uriparser/*.h",
- "cpp/src/arrow/vendored/uriparser/*.c",
- ]),
- includes = [
- "cpp/src",
- ],
- visibility = ["//visibility:private"],
-)
-
-cc_library(
- name = "arrow",
- srcs = glob(
- [
- "cpp/src/arrow/*.cc",
- "cpp/src/arrow/c/*.cc",
- "cpp/src/arrow/array/*.cc",
- "cpp/src/arrow/csv/*.cc",
- "cpp/src/arrow/extension/**/*.cc",
- "cpp/src/arrow/extension/**/*.h",
- "cpp/src/arrow/io/*.cc",
- "cpp/src/arrow/ipc/*.cc",
- "cpp/src/arrow/json/*.cc",
- "cpp/src/arrow/tensor/*.cc",
- "cpp/src/arrow/compute/**/*.cc",
- "cpp/src/arrow/util/*.cc",
- "cpp/src/arrow/vendored/optional.hpp",
- "cpp/src/arrow/vendored/string_view.hpp",
- "cpp/src/arrow/vendored/variant.hpp",
- "cpp/src/arrow/vendored/base64.cpp",
- "cpp/src/arrow/**/*.h",
- "cpp/src/parquet/**/*.h",
- "cpp/src/parquet/**/*.cc",
- "cpp/src/generated/*.h",
- "cpp/src/generated/*.cpp",
- "cpp/thirdparty/flatbuffers/include/flatbuffers/*.h",
- ],
- exclude = [
- "cpp/src/**/*_benchmark.cc",
- "cpp/src/**/*_main.cc",
- "cpp/src/**/*_nossl.cc",
- "cpp/src/**/*_test.cc",
- "cpp/src/**/test_*.h",
- "cpp/src/**/test_*.cc",
- "cpp/src/**/benchmark_util.h",
- "cpp/src/**/benchmark_util.cc",
- "cpp/src/**/*hdfs*.cc",
- "cpp/src/**/*fuzz*.cc",
- "cpp/src/arrow/memory_pool_jemalloc.cc",
- "cpp/src/**/file_to_stream.cc",
- "cpp/src/**/stream_to_file.cc",
- "cpp/src/arrow/dataset/file_orc*",
- "cpp/src/arrow/filesystem/gcsfs*.cc",
- "cpp/src/arrow/filesystem/s3*.cc",
- "cpp/src/arrow/filesystem/*_test_util.cc",
- "cpp/src/arrow/util/bpacking_avx2.cc",
- "cpp/src/arrow/util/bpacking_avx512.cc",
- "cpp/src/arrow/util/bpacking_neon.cc",
- "cpp/src/arrow/util/tracing_internal.cc",
- "cpp/src/arrow/compute/**/*_avx2.cc",
- ],
- ),
- hdrs = [
- # declare header from above genrule
- "cpp/src/arrow/util/config.h",
- "cpp/src/parquet/parquet_version.h",
- ],
- copts = [],
- defines = [
- "ARROW_WITH_BROTLI",
- "ARROW_WITH_SNAPPY",
- "ARROW_WITH_LZ4",
- "ARROW_WITH_ZLIB",
- "ARROW_WITH_ZSTD",
- "ARROW_WITH_BZ2",
- "ARROW_STATIC",
- "ARROW_EXPORT=",
- "PARQUET_STATIC",
- "PARQUET_EXPORT=",
- ],
- includes = [
- "cpp/src",
- "cpp/src/arrow/vendored/xxhash",
- "cpp/thirdparty/flatbuffers/include",
- ],
- linkopts = ["-lpthread"],
- textual_hdrs = [
- "cpp/src/arrow/vendored/xxhash/xxhash.c",
- ],
- deps = [
- ":arrow_vendored",
- "@boost//:multiprecision",
- "@brotli",
- "@bzip2",
- "@com_github_facebook_zstd//:zstd",
- "@com_github_gflags_gflags//:gflags",
- "@com_github_google_snappy//:snappy",
- "@com_github_grpc_grpc//:grpc++",
- "@com_github_grpc_grpc//:grpc++_reflection",
- "@com_github_lz4_lz4//:lz4",
- "@com_github_tencent_rapidjson//:rapidjson",
- "@com_github_xtensor_xsimd//:xsimd",
- "@com_google_double_conversion//:double-conversion",
- "@org_apache_thrift//:thrift",
- "@zlib",
- ],
-)
diff --git a/bazel/brotli.BUILD b/bazel/brotli.BUILD
deleted file mode 100644
index c586412..0000000
--- a/bazel/brotli.BUILD
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# copied from https://github.com/tensorflow/io/blob/v0.25.0/third_party/brotli.BUILD
-# Description:
-# Brotli library
-
-licenses(["notice"]) # MIT license
-
-exports_files(["LICENSE"])
-
-cc_library(
- name = "brotli",
- srcs = glob([
- "c/common/*.c",
- "c/common/*.h",
- "c/dec/*.c",
- "c/dec/*.h",
- "c/enc/*.c",
- "c/enc/*.h",
- "c/include/brotli/*.h",
- ]),
- hdrs = [],
- defines = [],
- includes = [
- "c/dec",
- "c/include",
- ],
- linkopts = [],
- visibility = ["//visibility:public"],
-)
diff --git a/bazel/bzip2.BUILD b/bazel/bzip2.BUILD
deleted file mode 100644
index fc618d3..0000000
--- a/bazel/bzip2.BUILD
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# copied from https://github.com/tensorflow/io/blob/v0.25.0/third_party/bzip2.BUILD
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"]) # BSD-like license
-
-cc_library(
- name = "bzip2",
- srcs = [
- "blocksort.c",
- "bzlib.c",
- "bzlib_private.h",
- "compress.c",
- "crctable.c",
- "decompress.c",
- "huffman.c",
- "randtable.c",
- ],
- hdrs = [
- "bzlib.h",
- ],
- copts = [
- ],
- includes = ["."],
-)
diff --git a/bazel/rapidjson.BUILD b/bazel/defs.bzl
similarity index 59%
rename from bazel/rapidjson.BUILD
rename to bazel/defs.bzl
index 86748d0..a758117 100644
--- a/bazel/rapidjson.BUILD
+++ b/bazel/defs.bzl
@@ -1,4 +1,4 @@
-# Copyright 2023 Ant Group Co., Ltd.
+# Copyright 2024 Ant Group Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,20 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+load("//bazel:repositories.bzl", "psi_deps")
-# copied from https://github.com/tensorflow/io/blob/v0.25.0/third_party/rapidjson.BUILD
+def _non_module_deps_impl(_module_ctx):
+ psi_deps()
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"]) # MIT/JSON license
-
-cc_library(
- name = "rapidjson",
- srcs = glob([
- "include/**/*.h",
- ]),
- copts = [],
- includes = [
- "include",
- ],
+non_module_dependencies = module_extension(
+ implementation = _non_module_deps_impl,
)
diff --git a/bazel/emp-tool.BUILD b/bazel/emp-tool.BUILD
deleted file mode 100644
index 57a2c77..0000000
--- a/bazel/emp-tool.BUILD
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@yacl//bazel:yacl.bzl", "yacl_cmake_external")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
- name = "all_srcs",
- srcs = glob(["**"]),
-)
-
-yacl_cmake_external(
- name = "emp-tool",
- cache_entries = {
- "OPENSSL_ROOT_DIR": "$EXT_BUILD_DEPS/openssl",
- "BUILD_TESTING": "OFF",
- },
- lib_source = ":all_srcs",
- out_data_dirs = ["cmake"],
- out_static_libs = [
- "libemp-tool.a",
- ],
- deps = [
- "@com_github_openssl_openssl//:openssl",
- ],
-)
diff --git a/bazel/gperftools.BUILD b/bazel/gperftools.BUILD
index b4314dd..4ace425 100644
--- a/bazel/gperftools.BUILD
+++ b/bazel/gperftools.BUILD
@@ -1,5 +1,4 @@
load("@rules_foreign_cc//foreign_cc:defs.bzl", "configure_make")
-load("@rules_cc//cc:defs.bzl", "cc_library")
package(default_visibility = ["//visibility:public"])
diff --git a/bazel/hexl.BUILD b/bazel/hexl.BUILD
deleted file mode 100644
index 5fbe1e1..0000000
--- a/bazel/hexl.BUILD
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
- name = "all_srcs",
- srcs = glob(["**"]),
-)
-
-cmake(
- name = "hexl",
- cache_entries = {
- "CMAKE_BUILD_TYPE": "Release",
- "CpuFeatures_DIR": "$EXT_BUILD_DEPS/cpu_features/lib/cmake/CpuFeatures/",
- "HEXL_BENCHMARK": "OFF",
- "HEXL_TESTING": "OFF",
- "CMAKE_INSTALL_LIBDIR": "lib",
- },
- generate_args = ["-GNinja"],
- lib_source = ":all_srcs",
- out_data_dirs = ["lib/cmake"],
- out_static_libs = ["libhexl.a"],
- deps = [
- "@com_github_google_cpu_features//:cpu_features",
- ],
-)
diff --git a/bazel/ipp.BUILD b/bazel/ipp.BUILD
deleted file mode 100644
index 56e308f..0000000
--- a/bazel/ipp.BUILD
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@yacl//bazel:yacl.bzl", "yacl_cmake_external")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
- name = "all_srcs",
- srcs = glob(["**"]),
-)
-
-yacl_cmake_external(
- name = "ipp",
- cache_entries = {
- "ARCH": "intel64",
- "OPENSSL_INCLUDE_DIR": "$EXT_BUILD_DEPS/openssl/include",
- "OPENSSL_LIBRARIES": "$EXT_BUILD_DEPS/openssl/lib",
- "OPENSSL_ROOT_DIR": "$EXT_BUILD_DEPS/openssl",
- "CMAKE_BUILD_TYPE": "Release",
- },
- lib_source = ":all_srcs",
- out_static_libs = [
- "intel64/libippcp.a",
- "intel64/libcrypto_mb.a",
- ],
- deps = [
- "@com_github_openssl_openssl//:openssl",
- ],
-)
diff --git a/bazel/jsoncpp.BUILD b/bazel/jsoncpp.BUILD
index 53134c0..0c096e8 100644
--- a/bazel/jsoncpp.BUILD
+++ b/bazel/jsoncpp.BUILD
@@ -30,10 +30,10 @@ cmake(
"BUILD_OBJECT_LIBS": "OFF",
"CMAKE_INSTALL_LIBDIR": "lib",
},
- generate_args = ["-GNinja"],
env = {
"CCACHE_DISABLE": "1",
},
+ generate_args = ["-GNinja"],
lib_source = "@com_github_open_source_parsers_jsoncpp//:all",
out_static_libs = ["libjsoncpp.a"],
)
diff --git a/bazel/libdivide.BUILD b/bazel/libdivide.BUILD
deleted file mode 100644
index c044063..0000000
--- a/bazel/libdivide.BUILD
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
- name = "all_srcs",
- srcs = glob(["**"]),
-)
-
-cmake(
- name = "libdivide",
- cache_entries = {
- "BUILD_TESTS": "OFF",
- },
- generate_args = ["-GNinja"],
- lib_source = ":all_srcs",
- out_headers_only = True,
- out_include_dir = "include",
-)
diff --git a/bazel/lz4.BUILD b/bazel/lz4.BUILD
deleted file mode 100644
index 80f3e37..0000000
--- a/bazel/lz4.BUILD
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
- name = "all_srcs",
- srcs = glob(["**"]),
-)
-
-cmake(
- name = "lz4",
- cache_entries = {
- "LZ4_BUILD_CLI": "OFF",
- "BUILD_SHARED_LIBS": "OFF",
- "BUILD_STATIC_LIBS": "ON",
- "CMAKE_INSTALL_LIBDIR": "lib",
- },
- generate_args = ["-GNinja"],
- lib_source = ":all_srcs",
- out_static_libs = [
- "liblz4.a",
- ],
- working_directory = "build/cmake",
-)
diff --git a/bazel/microsoft_apsi.BUILD b/bazel/microsoft_apsi.BUILD
index 4dcebc9..02fd14f 100644
--- a/bazel/microsoft_apsi.BUILD
+++ b/bazel/microsoft_apsi.BUILD
@@ -30,19 +30,19 @@ cmake(
"EXT_BUILD_DEPS": "$EXT_BUILD_DEPS",
},
generate_args = ["-GNinja"],
- lib_source = "@com_github_microsoft_apsi//:all",
+ lib_source = "@apsi//:all",
out_include_dir = "include/APSI-0.11",
out_static_libs = ["libapsi-0.11.a"],
deps = [
- "@com_github_facebook_zstd//:zstd",
"@com_github_log4cplus_log4cplus//:log4cplus",
- "@com_github_microsoft_FourQlib//:FourQlib",
"@com_github_microsoft_gsl//:Microsoft.GSL",
- "@com_github_microsoft_kuku//:kuku",
- "@com_github_microsoft_seal//:seal",
"@com_github_open_source_parsers_jsoncpp//:jsoncpp",
"@com_github_zeromq_cppzmq//:cppzmq",
"@com_google_flatbuffers//:FlatBuffers",
+ "@fourqlib//:FourQlib",
+ "@kuku",
+ "@seal",
"@zlib",
+ "@zstd",
],
)
diff --git a/bazel/microsoft_kuku.BUILD b/bazel/microsoft_kuku.BUILD
index 3c4c5a1..1a8f930 100644
--- a/bazel/microsoft_kuku.BUILD
+++ b/bazel/microsoft_kuku.BUILD
@@ -30,7 +30,7 @@ cmake(
"CMAKE_INSTALL_LIBDIR": "lib",
},
generate_args = ["-GNinja"],
- lib_source = "@com_github_microsoft_kuku//:all",
+ lib_source = "@kuku//:all",
out_include_dir = "include/Kuku-2.1",
out_static_libs = ["libkuku-2.1.a"],
deps = ["@com_github_microsoft_gsl//:Microsoft.GSL"],
diff --git a/bazel/patches/apsi.patch b/bazel/patches/apsi.patch
index 20cb990..176c6c6 100644
--- a/bazel/patches/apsi.patch
+++ b/bazel/patches/apsi.patch
@@ -22,11 +22,11 @@ index e683045..067d244 100644
@@ -30,7 +30,7 @@ namespace apsi {
return item_count_;
}
-
+
- private:
+ // private:
IndexTranslationTable() = default;
-
+
std::unordered_map table_idx_to_item_idx_;
diff --git a/receiver/apsi/CMakeLists.txt b/receiver/apsi/CMakeLists.txt
index afce298..1790b30 100644
@@ -45,12 +45,12 @@ index 850ac47..aef52a4 100644
--- a/common/apsi/network/sender_operation.cpp
+++ b/common/apsi/network/sender_operation.cpp
@@ -135,7 +135,7 @@ namespace apsi {
-
+
auto oprf_data = fbs_builder.CreateVector(
reinterpret_cast(data.data()), data.size());
- auto req = fbs::CreateOPRFRequest(fbs_builder, oprf_data);
+ auto req = fbs::CreateOPRFRequest(fbs_builder, oprf_data, bucket_idx);
-
+
fbs::SenderOperationBuilder sop_builder(fbs_builder);
sop_builder.add_request_type(fbs::Request_OPRFRequest);
@@ -180,6 +180,7 @@ namespace apsi {
@@ -58,7 +58,7 @@ index 850ac47..aef52a4 100644
data.resize(oprf_data.size());
copy_bytes(oprf_data.data(), oprf_data.size(), data.data());
+ bucket_idx = sop->request_as_OPRFRequest()->bucket_idx();
-
+
return in_data.size();
}
@@ -231,7 +232,8 @@ namespace apsi {
@@ -68,13 +68,13 @@ index 850ac47..aef52a4 100644
- query_request_parts);
+ query_request_parts,
+ bucket_idx);
-
+
fbs::SenderOperationBuilder sop_builder(fbs_builder);
sop_builder.add_request_type(fbs::Request_QueryRequest);
@@ -346,6 +348,8 @@ namespace apsi {
data.emplace(exponent, move(cts_vec));
}
-
+
+ bucket_idx = req.bucket_idx();
+
return in_data.size();
@@ -91,7 +91,7 @@ index a9cc4df..ce3769c 100644
+
+ std::uint32_t bucket_idx = 0;
}; // class SenderOperationOPRF
-
+
/**
@@ -140,6 +142,8 @@ namespace apsi {
ciphertext and the vector holds the ciphertext data for different bundle indices.
@@ -108,12 +108,12 @@ index 4c4e116..8eb34fc 100644
--- a/common/apsi/network/sop.fbs
+++ b/common/apsi/network/sop.fbs
@@ -10,6 +10,7 @@ table ParmsRequest {
-
+
table OPRFRequest {
data:[ubyte] (required);
+ bucket_idx:uint32;
}
-
+
table QueryRequestPart {
@@ -21,6 +22,7 @@ table QueryRequest {
compression_type:ubyte;
@@ -121,5 +121,5 @@ index 4c4e116..8eb34fc 100644
query:[QueryRequestPart] (required);
+ bucket_idx:uint32;
}
-
+
union Request { ParmsRequest, OPRFRequest, QueryRequest }
diff --git a/bazel/patches/boost.patch b/bazel/patches/boost.patch
deleted file mode 100644
index 6772b61..0000000
--- a/bazel/patches/boost.patch
+++ /dev/null
@@ -1,42 +0,0 @@
-diff --git a/config.lzma-linux.h b/config.lzma-linux.h
-index e8b00d8..092696f 100644
---- a/config.lzma-linux.h
-+++ b/config.lzma-linux.h
-@@ -56,7 +56,9 @@
- /* #undef HAVE_COMMONCRYPTO_COMMONDIGEST_H */
-
- /* Define to 1 if you have the header file. */
--#define HAVE_CPUID_H 1
-+#ifdef __x86_64__
-+ #define HAVE_CPUID_H 1
-+#endif
-
- /* Define if the GNU dcgettext() function is already present or preinstalled.
- */
-@@ -309,7 +311,9 @@
-
- /* Define to 1 if _mm_clmulepi64_si128 is usable. See configure.ac for
- details. */
-+#ifdef __x86_64__
- #define HAVE_USABLE_CLMUL 1
-+#endif
-
- /* Define to 1 if you have the `utime' function. */
- /* #undef HAVE_UTIME */
-diff --git a/boost/boost.bzl b/boost/boost.bzl
-index 8277dbb..afc9569 100644
---- a/boost/boost.bzl
-+++ b/boost/boost.bzl
-@@ -139,9 +139,9 @@ def boost_deps():
- http_archive,
- name = "org_lzma_lzma",
- build_file = "@com_github_nelhage_rules_boost//:lzma.BUILD",
-- url = "https://github.com/tukaani-project/xz/releases/download/v5.4.4/xz-5.4.4.tar.gz",
-- sha256 = "aae39544e254cfd27e942d35a048d592959bd7a79f9a624afb0498bb5613bdf8",
-- strip_prefix = "xz-5.4.4",
-+ url = "https://src.fedoraproject.org/lookaside/extras/xz/xz-5.4.6.tar.gz/sha512/b08a61d8d478d3b4675cb1ddacdbbd98dc6941a55bcdd81a28679e54e9367d3a595fa123ac97874a17da571c1b712e2a3e901c2737099a9d268616a1ba3de497/xz-5.4.6.tar.gz",
-+ sha256 = "aeba3e03bf8140ddedf62a0a367158340520f6b384f75ca6045ccc6c0d43fd5c",
-+ strip_prefix = "xz-5.4.6",
- )
-
- maybe(
\ No newline at end of file
diff --git a/bazel/patches/emp-tool-cmake.patch b/bazel/patches/emp-tool-cmake.patch
deleted file mode 100644
index 01aa13d..0000000
--- a/bazel/patches/emp-tool-cmake.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index d9abb31..4c2c171 100755
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -56,11 +56,14 @@ find_package(OpenSSL REQUIRED)
- include_directories(${OPENSSL_INCLUDE_DIR})
-
-
--add_library(${NAME} SHARED ${sources})
-+add_library(${NAME} STATIC ${sources})
-
- install(DIRECTORY emp-tool DESTINATION include/)
- install(DIRECTORY cmake/ DESTINATION cmake/)
- install(TARGETS ${NAME} DESTINATION lib)
-
--ENABLE_TESTING()
--ADD_SUBDIRECTORY(test)
-+option(ENABLE_TESTS "Enable tests" OFF)
-+if (${ENABLE_TESTS})
-+ ENABLE_TESTING()
-+ ADD_SUBDIRECTORY(test)
-+endif()
diff --git a/bazel/patches/emp-tool-sse2neon.patch b/bazel/patches/emp-tool-sse2neon.patch
deleted file mode 100644
index e94b22e..0000000
--- a/bazel/patches/emp-tool-sse2neon.patch
+++ /dev/null
@@ -1,6507 +0,0 @@
-diff --git a/emp-tool/utils/sse2neon.h b/emp-tool/utils/sse2neon.h
-index d09b9c7..efa63a4 100644
---- a/emp-tool/utils/sse2neon.h
-+++ b/emp-tool/utils/sse2neon.h
-@@ -113,7 +113,7 @@
- #ifdef _MSC_VER
- #include
- #if (defined(_M_AMD64) || defined(__x86_64__)) || \
-- (defined(_M_ARM) || defined(__arm__))
-+ (defined(_M_ARM64) || defined(__arm64__))
- #define SSE2NEON_HAS_BITSCAN64
- #endif
- #endif
-@@ -441,7 +441,7 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */
- // by applications which attempt to access the contents of an __m128 struct
- // directly. It is important to note that accessing the __m128 struct directly
- // is bad coding practice by Microsoft: @see:
--// https://docs.microsoft.com/en-us/cpp/cpp/m128
-+// https://learn.microsoft.com/en-us/cpp/cpp/m128
- //
- // However, some legacy source code may try to access the contents of an __m128
- // struct directly so the developer can use the SIMDVec as an alias for it. Any
-@@ -621,47 +621,6 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
- * 4, 5, 12, 13, 6, 7, 14, 15);
- * // Shuffle packed 8-bit integers
- * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
-- *
-- * Data (Number, Binary, Byte Index):
-- +------+------+-------------+------+------+-------------+
-- | 1 | 2 | 3 | 4 | Number
-- +------+------+------+------+------+------+------+------+
-- | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
-- +------+------+------+------+------+------+------+------+
-- | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
-- +------+------+------+------+------+------+------+------+
--
-- +------+------+------+------+------+------+------+------+
-- | 5 | 6 | 7 | 8 | Number
-- +------+------+------+------+------+------+------+------+
-- | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
-- +------+------+------+------+------+------+------+------+
-- | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
-- +------+------+------+------+------+------+------+------+
-- * Index (Byte Index):
-- +------+------+------+------+------+------+------+------+
-- | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
-- +------+------+------+------+------+------+------+------+
--
-- +------+------+------+------+------+------+------+------+
-- | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
-- +------+------+------+------+------+------+------+------+
-- * Result:
-- +------+------+------+------+------+------+------+------+
-- | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
-- +------+------+------+------+------+------+------+------+
-- | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
-- +------+------+------+------+------+------+------+------+
-- | 256 | 2 | 5 | 6 | Number
-- +------+------+------+------+------+------+------+------+
--
-- +------+------+------+------+------+------+------+------+
-- | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
-- +------+------+------+------+------+------+------+------+
-- | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
-- +------+------+------+------+------+------+------+------+
-- | 3 | 7 | 4 | 8 | Number
-- +------+------+------+------+------+------+-------------+
- */
-
- /* Constants for use with _mm_prefetch. */
-@@ -1069,9 +1028,9 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
- })
- #endif
-
--// NEON does not support a general purpose permute intrinsic
--// Selects four specific single-precision, floating-point values from a and b,
--// based on the mask i.
-+// NEON does not support a general purpose permute intrinsic.
-+// Shuffle single-precision (32-bit) floating-point elements in a using the
-+// control in imm8, and store the results in dst.
- //
- // C equivalent:
- // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
-@@ -1082,7 +1041,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
- // return ret;
- // }
- //
--// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
- #define _mm_shuffle_ps_default(a, b, imm) \
- __extension__({ \
- float32x4_t ret; \
-@@ -1100,12 +1059,10 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
- vreinterpretq_m128_f32(ret); \
- })
-
--// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
--// by imm.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
--// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
--// __constrange(0,255) int
--// imm)
-+// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
-+// Store the results in the low 64 bits of dst, with the high 64 bits being
-+// copied from from a to dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
- #define _mm_shufflelo_epi16_function(a, imm) \
- __extension__({ \
- int16x8_t ret = vreinterpretq_s16_m128i(a); \
-@@ -1120,12 +1077,10 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
- vreinterpretq_m128i_s16(ret); \
- })
-
--// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
--// by imm.
--// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
--// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
--// __constrange(0,255) int
--// imm)
-+// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
-+// Store the results in the high 64 bits of dst, with the low 64 bits being
-+// copied from from a to dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
- #define _mm_shufflehi_epi16_function(a, imm) \
- __extension__({ \
- int16x8_t ret = vreinterpretq_s16_m128i(a); \
-@@ -1147,22 +1102,19 @@ FORCE_INLINE void _mm_empty(void) {}
-
- /* SSE */
-
--// Adds the four single-precision, floating-point values of a and b.
--//
--// r0 := a0 + b0
--// r1 := a1 + b1
--// r2 := a2 + b2
--// r3 := a3 + b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
-+// Add packed single-precision (32-bit) floating-point elements in a and b, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
- FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_f32(
- vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
-
--// adds the scalar single-precision floating point values of a and b.
--// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
-+// Add the lower single-precision (32-bit) floating-point element in a and b,
-+// store the result in the lower element of dst, and copy the upper 3 packed
-+// elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
- FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
- {
- float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-@@ -1171,30 +1123,18 @@ FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
- return vreinterpretq_m128_f32(vaddq_f32(a, value));
- }
-
--// Computes the bitwise AND of the four single-precision, floating-point values
--// of a and b.
--//
--// r0 := a0 & b0
--// r1 := a1 & b1
--// r2 := a2 & b2
--// r3 := a3 & b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
-+// Compute the bitwise AND of packed single-precision (32-bit) floating-point
-+// elements in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
- FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_s32(
- vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
- }
-
--// Computes the bitwise AND-NOT of the four single-precision, floating-point
--// values of a and b.
--//
--// r0 := ~a0 & b0
--// r1 := ~a1 & b1
--// r2 := ~a2 & b2
--// r3 := ~a3 & b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
-+// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
-+// elements in a and then AND with b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
- FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_s32(
-@@ -1204,13 +1144,7 @@ FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
-
- // Average packed unsigned 16-bit integers in a and b, and store the results in
- // dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
- FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
- {
- return vreinterpret_m64_u16(
-@@ -1219,186 +1153,199 @@ FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
-
- // Average packed unsigned 8-bit integers in a and b, and store the results in
- // dst.
--//
--// FOR j := 0 to 7
--// i := j*8
--// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
- FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
- {
- return vreinterpret_m64_u8(
- vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
- }
-
--// Compares for equality.
--// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for equality, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
- FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
-
--// Compares for equality.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for equality, store the result in the lower element of dst, and copy the
-+// upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
- FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
- }
-
--// Compares for greater than or equal.
--// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for greater-than-or-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
- FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(
- vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
-
--// Compares for greater than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for greater-than-or-equal, store the result in the lower element of dst,
-+// and copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
- FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpge_ps(a, b));
- }
-
--// Compares for greater than.
--//
--// r0 := (a0 > b0) ? 0xffffffff : 0x0
--// r1 := (a1 > b1) ? 0xffffffff : 0x0
--// r2 := (a2 > b2) ? 0xffffffff : 0x0
--// r3 := (a3 > b3) ? 0xffffffff : 0x0
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for greater-than, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
- FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(
- vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
-
--// Compares for greater than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for greater-than, store the result in the lower element of dst, and copy
-+// the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
- FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
- }
-
--// Compares for less than or equal.
--//
--// r0 := (a0 <= b0) ? 0xffffffff : 0x0
--// r1 := (a1 <= b1) ? 0xffffffff : 0x0
--// r2 := (a2 <= b2) ? 0xffffffff : 0x0
--// r3 := (a3 <= b3) ? 0xffffffff : 0x0
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for less-than-or-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
- FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(
- vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
-
--// Compares for less than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for less-than-or-equal, store the result in the lower element of dst, and
-+// copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
- FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmple_ps(a, b));
- }
-
--// Compares for less than
--// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for less-than, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
- FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(
- vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
-
--// Compares for less than
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for less-than, store the result in the lower element of dst, and copy the
-+// upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
- FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmplt_ps(a, b));
- }
-
--// Compares for inequality.
--// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
- FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(vmvnq_u32(
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
-
--// Compares for inequality.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-equal, store the result in the lower element of dst, and copy the
-+// upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
- FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
- }
-
--// Compares for not greater than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-greater-than-or-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
- FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(vmvnq_u32(
- vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
-
--// Compares for not greater than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-greater-than-or-equal, store the result in the lower element of
-+// dst, and copy the upper 3 packed elements from a to the upper elements of
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
- FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
- }
-
--// Compares for not greater than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-greater-than, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
- FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(vmvnq_u32(
- vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
-
--// Compares for not greater than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-greater-than, store the result in the lower element of dst, and
-+// copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
- FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
- }
-
--// Compares for not less than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-less-than-or-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
- FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(vmvnq_u32(
- vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
-
--// Compares for not less than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-less-than-or-equal, store the result in the lower element of dst,
-+// and copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
- FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
- }
-
--// Compares for not less than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-less-than, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
- FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_u32(vmvnq_u32(
- vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
-
--// Compares for not less than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-less-than, store the result in the lower element of dst, and copy
-+// the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
- FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
- }
-
--// Compares the four 32-bit floats in a and b to check if any values are NaN.
--// Ordered compare between each value returns true for "orderable" and false for
--// "not orderable" (NaN).
--// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
--// also:
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// to see if neither is NaN, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
-+//
-+// See also:
- // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
- // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
- FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
-@@ -1413,15 +1360,18 @@ FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
- return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
- }
-
--// Compares for ordered.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b to see if neither is NaN, store the result in the lower element of dst, and
-+// copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
- FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpord_ps(a, b));
- }
-
--// Compares for unordered.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// to see if either is NaN, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
- FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
- {
- uint32x4_t f32a =
-@@ -1431,16 +1381,18 @@ FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
- return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
- }
-
--// Compares for unordered.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b to see if either is NaN, store the result in the lower element of dst, and
-+// copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
- FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
- }
-
--// Compares the lower single-precision floating point scalar values of a and b
--// using an equality operation. :
--// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for equality, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
- FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
- {
- uint32x4_t a_eq_b =
-@@ -1448,9 +1400,9 @@ FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
- return vgetq_lane_u32(a_eq_b, 0) & 0x1;
- }
-
--// Compares the lower single-precision floating point scalar values of a and b
--// using a greater than or equal operation. :
--// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for greater-than-or-equal, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
- FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
- {
- uint32x4_t a_ge_b =
-@@ -1458,9 +1410,9 @@ FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
- return vgetq_lane_u32(a_ge_b, 0) & 0x1;
- }
-
--// Compares the lower single-precision floating point scalar values of a and b
--// using a greater than operation. :
--// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for greater-than, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
- FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
- {
- uint32x4_t a_gt_b =
-@@ -1468,9 +1420,9 @@ FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
- return vgetq_lane_u32(a_gt_b, 0) & 0x1;
- }
-
--// Compares the lower single-precision floating point scalar values of a and b
--// using a less than or equal operation. :
--// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for less-than-or-equal, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
- FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
- {
- uint32x4_t a_le_b =
-@@ -1478,11 +1430,9 @@ FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
- return vgetq_lane_u32(a_le_b, 0) & 0x1;
- }
-
--// Compares the lower single-precision floating point scalar values of a and b
--// using a less than operation. :
--// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
--// note!! The documentation on MSDN is incorrect! If either of the values is a
--// NAN the docs say you will get a one, but in fact, it will return a zero!!
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for less-than, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
- FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
- {
- uint32x4_t a_lt_b =
-@@ -1490,9 +1440,9 @@ FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
- return vgetq_lane_u32(a_lt_b, 0) & 0x1;
- }
-
--// Compares the lower single-precision floating point scalar values of a and b
--// using an inequality operation. :
--// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for not-equal, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
- FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
- {
- return !_mm_comieq_ss(a, b);
-@@ -1502,13 +1452,7 @@ FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
- // (32-bit) floating-point elements, store the results in the lower 2 elements
- // of dst, and copy the upper 2 packed elements from a to the upper elements of
- // dst.
--//
--// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
--// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
--// dst[95:64] := a[95:64]
--// dst[127:96] := a[127:96]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
- FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
- {
- return vreinterpretq_m128_f32(
-@@ -1518,13 +1462,7 @@ FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
-
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 32-bit integers, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
- FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -1539,11 +1477,7 @@ FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
- // Convert the signed 32-bit integer b to a single-precision (32-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper 3 packed elements from a to the upper elements of dst.
--//
--// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
- FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
- {
- return vreinterpretq_m128_f32(
-@@ -1552,7 +1486,7 @@ FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
-
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 32-bit integer, and store the result in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
- FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -1567,14 +1501,7 @@ FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
-
- // Convert packed 16-bit integers in a to packed single-precision (32-bit)
- // floating-point elements, and store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// m := j*32
--// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
- FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
- {
- return vreinterpretq_m128_f32(
-@@ -1584,13 +1511,7 @@ FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
- // Convert packed 32-bit integers in b to packed single-precision (32-bit)
- // floating-point elements, store the results in the lower 2 elements of dst,
- // and copy the upper 2 packed elements from a to the upper elements of dst.
--//
--// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
--// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
--// dst[95:64] := a[95:64]
--// dst[127:96] := a[127:96]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
- FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
- {
- return vreinterpretq_m128_f32(
-@@ -1603,13 +1524,7 @@ FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
- // of dst, then convert the packed signed 32-bit integers in b to
- // single-precision (32-bit) floating-point element, and store the results in
- // the upper 2 elements of dst.
--//
--// dst[31:0] := Convert_Int32_To_FP32(a[31:0])
--// dst[63:32] := Convert_Int32_To_FP32(a[63:32])
--// dst[95:64] := Convert_Int32_To_FP32(b[31:0])
--// dst[127:96] := Convert_Int32_To_FP32(b[63:32])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
- FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
- {
- return vreinterpretq_m128_f32(vcvtq_f32_s32(
-@@ -1618,14 +1533,7 @@ FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
-
- // Convert the lower packed 8-bit integers in a to packed single-precision
- // (32-bit) floating-point elements, and store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*8
--// m := j*32
--// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
- FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
- {
- return vreinterpretq_m128_f32(vcvtq_f32_s32(
-@@ -1636,18 +1544,7 @@ FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
- // packed 16-bit integers, and store the results in dst. Note: this intrinsic
- // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
- // 0x7FFFFFFF.
--//
--// FOR j := 0 to 3
--// i := 16*j
--// k := 32*j
--// IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)
--// dst[i+15:i] := 0x7FFF
--// ELSE
--// dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
- FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
- {
- return vreinterpret_m64_s16(
-@@ -1656,31 +1553,14 @@ FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
-
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 32-bit integers, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
- #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
-
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 8-bit integers, and store the results in lower 4 elements of dst.
- // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
- // between 0x7F and 0x7FFFFFFF.
--//
--// FOR j := 0 to 3
--// i := 8*j
--// k := 32*j
--// IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)
--// dst[i+7:i] := 0x7F
--// ELSE
--// dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
- FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
- {
- return vreinterpret_m64_s8(vqmovn_s16(
-@@ -1689,14 +1569,7 @@ FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
-
- // Convert packed unsigned 16-bit integers in a to packed single-precision
- // (32-bit) floating-point elements, and store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// m := j*32
--// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
- FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
- {
- return vreinterpretq_m128_f32(
-@@ -1706,14 +1579,7 @@ FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
- // Convert the lower packed unsigned 8-bit integers in a to packed
- // single-precision (32-bit) floating-point elements, and store the results in
- // dst.
--//
--// FOR j := 0 to 3
--// i := j*8
--// m := j*32
--// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
- FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
- {
- return vreinterpretq_m128_f32(vcvtq_f32_u32(
-@@ -1723,21 +1589,13 @@ FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
- // Convert the signed 32-bit integer b to a single-precision (32-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper 3 packed elements from a to the upper elements of dst.
--//
--// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
- #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
-
- // Convert the signed 64-bit integer b to a single-precision (32-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper 3 packed elements from a to the upper elements of dst.
--//
--// dst[31:0] := Convert_Int64_To_FP32(b[63:0])
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
- FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
- {
- return vreinterpretq_m128_f32(
-@@ -1745,10 +1603,7 @@ FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
- }
-
- // Copy the lower single-precision (32-bit) floating-point element of a to dst.
--//
--// dst[31:0] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
- FORCE_INLINE float _mm_cvtss_f32(__m128 a)
- {
- return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-@@ -1756,18 +1611,12 @@ FORCE_INLINE float _mm_cvtss_f32(__m128 a)
-
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 32-bit integer, and store the result in dst.
--//
--// dst[31:0] := Convert_FP32_To_Int32(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
- #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
-
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 64-bit integer, and store the result in dst.
--//
--// dst[63:0] := Convert_FP32_To_Int64(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
- FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -1781,13 +1630,7 @@ FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
-
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 32-bit integers with truncation, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
- FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
- {
- return vreinterpret_m64_s32(
-@@ -1796,10 +1639,7 @@ FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
-
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 32-bit integer with truncation, and store the result in dst.
--//
--// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
- FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
- {
- return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-@@ -1807,60 +1647,49 @@ FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
-
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 32-bit integers with truncation, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
- #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
-
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 32-bit integer with truncation, and store the result in dst.
--//
--// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
- #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
-
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 64-bit integer with truncation, and store the result in dst.
--//
--// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
- FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
- {
- return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- }
-
--// Divides the four single-precision, floating-point values of a and b.
--//
--// r0 := a0 / b0
--// r1 := a1 / b1
--// r2 := a2 / b2
--// r3 := a3 / b3
--//
--// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
-+// Divide packed single-precision (32-bit) floating-point elements in a by
-+// packed elements in b, and store the results in dst.
-+// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement
-+// division by multiplying a by b's reciprocal before using the Newton-Raphson
-+// method to approximate the results.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
- FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
- {
--#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
-+#if defined(__aarch64__)
- return vreinterpretq_m128_f32(
- vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- #else
- float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
- recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
--#if SSE2NEON_PRECISE_DIV
- // Additional Netwon-Raphson iteration for accuracy
- recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
--#endif
- return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
- #endif
- }
-
--// Divides the scalar single-precision floating point value of a by b.
--// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
-+// Divide the lower single-precision (32-bit) floating-point element in a by the
-+// lower single-precision (32-bit) floating-point element in b, store the result
-+// in the lower element of dst, and copy the upper 3 packed elements from a to
-+// the upper elements of dst.
-+// Warning: ARMv7-A does not produce the same result compared to Intel and not
-+// IEEE-compliant.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
- FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
- {
- float32_t value =
-@@ -1871,12 +1700,12 @@ FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-
- // Extract a 16-bit integer from a, selected with imm8, and store the result in
- // the lower element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
- #define _mm_extract_pi16(a, imm) \
- (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
-
- // Free aligned memory that was allocated with _mm_malloc.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
- #if !defined(SSE2NEON_ALLOC_DEFINED)
- FORCE_INLINE void _mm_free(void *addr)
- {
-@@ -1887,7 +1716,7 @@ FORCE_INLINE void _mm_free(void *addr)
- // Macro: Get the flush zero bits from the MXCSR control and status register.
- // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
- // _MM_FLUSH_ZERO_OFF
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
- FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
- {
- union {
-@@ -1911,7 +1740,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
- // Macro: Get the rounding mode bits from the MXCSR control and status register.
- // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
- // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
- FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
- {
- union {
-@@ -1938,15 +1767,17 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
-
- // Copy a to dst, and insert the 16-bit integer i into dst at the location
- // specified by imm8.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
- #define _mm_insert_pi16(a, b, imm) \
- __extension__({ \
- vreinterpret_m64_s16( \
- vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
- })
-
--// Loads four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
-+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-+// boundary or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
- FORCE_INLINE __m128 _mm_load_ps(const float *p)
- {
- return vreinterpretq_m128_f32(vld1q_f32(p));
-@@ -1960,52 +1791,40 @@ FORCE_INLINE __m128 _mm_load_ps(const float *p)
- // dst[95:64] := MEM[mem_addr+31:mem_addr]
- // dst[127:96] := MEM[mem_addr+31:mem_addr]
- //
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
- #define _mm_load_ps1 _mm_load1_ps
-
--// Loads an single - precision, floating - point value into the low word and
--// clears the upper three words.
--// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
-+// Load a single-precision (32-bit) floating-point element from memory into the
-+// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
-+// aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
- FORCE_INLINE __m128 _mm_load_ss(const float *p)
- {
- return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
- }
-
--// Loads a single single-precision, floating-point value, copying it into all
--// four words
--// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
-+// Load a single-precision (32-bit) floating-point element from memory into all
-+// elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
- FORCE_INLINE __m128 _mm_load1_ps(const float *p)
- {
- return vreinterpretq_m128_f32(vld1q_dup_f32(p));
- }
-
--// Sets the upper two single-precision, floating-point values with 64
--// bits of data loaded from the address p; the lower two values are passed
--// through from a.
--//
--// r0 := a0
--// r1 := a1
--// r2 := *p0
--// r3 := *p1
--//
--// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
-+// Load 2 single-precision (32-bit) floating-point elements from memory into the
-+// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
-+// mem_addr does not need to be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
- FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
- {
- return vreinterpretq_m128_f32(
- vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
- }
-
--// Sets the lower two single-precision, floating-point values with 64
--// bits of data loaded from the address p; the upper two values are passed
--// through from a.
--//
--// Return Value
--// r0 := *p0
--// r1 := *p1
--// r2 := a2
--// r3 := a3
--//
--// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
-+// Load 2 single-precision (32-bit) floating-point elements from memory into the
-+// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
-+// mem_addr does not need to be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
- FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
- {
- return vreinterpretq_m128_f32(
-@@ -2015,21 +1834,17 @@ FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
- // Load 4 single-precision (32-bit) floating-point elements from memory into dst
- // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--// dst[31:0] := MEM[mem_addr+127:mem_addr+96]
--// dst[63:32] := MEM[mem_addr+95:mem_addr+64]
--// dst[95:64] := MEM[mem_addr+63:mem_addr+32]
--// dst[127:96] := MEM[mem_addr+31:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
- FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
- {
- float32x4_t v = vrev64q_f32(vld1q_f32(p));
- return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
- }
-
--// Loads four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
-+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-+// elements) from memory into dst. mem_addr does not need to be aligned on any
-+// particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
- FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
- {
- // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
-@@ -2038,11 +1853,7 @@ FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
- }
-
- // Load unaligned 16-bit integer from memory into the first element of dst.
--//
--// dst[15:0] := MEM[mem_addr+15:mem_addr]
--// dst[MAX:16] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
- FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
- {
- return vreinterpretq_m128i_s16(
-@@ -2050,20 +1861,17 @@ FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
- }
-
- // Load unaligned 64-bit integer from memory into the first element of dst.
--//
--// dst[63:0] := MEM[mem_addr+63:mem_addr]
--// dst[MAX:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
- FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
- {
- return vreinterpretq_m128i_s64(
- vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
- }
-
--// Allocate aligned blocks of memory.
--// https://software.intel.com/en-us/
--// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
-+// Allocate size bytes of memory, aligned to the alignment specified in align,
-+// and return a pointer to the allocated memory. _mm_free should be used to free
-+// memory that is allocated with _mm_malloc.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
- #if !defined(SSE2NEON_ALLOC_DEFINED)
- FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
- {
-@@ -2081,7 +1889,7 @@ FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
- // Conditionally store 8-bit integer elements from a into memory using mask
- // (elements are not stored when the highest bit is not set in the corresponding
- // element) and a non-temporal memory hint.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
- FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
- {
- int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
-@@ -2095,27 +1903,23 @@ FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
- // Conditionally store 8-bit integer elements from a into memory using mask
- // (elements are not stored when the highest bit is not set in the corresponding
- // element) and a non-temporal memory hint.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
- #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
-
- // Compare packed signed 16-bit integers in a and b, and store packed maximum
- // values in dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
- FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
- {
- return vreinterpret_m64_s16(
- vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
- }
-
--// Computes the maximums of the four single-precision, floating-point values of
--// a and b.
--// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b,
-+// and store packed maximum values in dst. dst does not follow the IEEE Standard
-+// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
-+// signed-zero values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
- FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
- {
- #if SSE2NEON_PRECISE_MINMAX
-@@ -2130,22 +1934,19 @@ FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
-
- // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
- // values in dst.
--//
--// FOR j := 0 to 7
--// i := j*8
--// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
- FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
- {
- return vreinterpret_m64_u8(
- vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
- }
-
--// Computes the maximum of the two lower scalar single-precision floating point
--// values of a and b.
--// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b, store the maximum value in the lower element of dst, and copy the upper 3
-+// packed elements from a to the upper element of dst. dst does not follow the
-+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
-+// inputs are NaN or signed-zero values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
- FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
- {
- float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
-@@ -2155,22 +1956,18 @@ FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
-
- // Compare packed signed 16-bit integers in a and b, and store packed minimum
- // values in dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
- FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
- {
- return vreinterpret_m64_s16(
- vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
- }
-
--// Computes the minima of the four single-precision, floating-point values of a
--// and b.
--// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b,
-+// and store packed minimum values in dst. dst does not follow the IEEE Standard
-+// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
-+// signed-zero values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
- FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
- {
- #if SSE2NEON_PRECISE_MINMAX
-@@ -2185,22 +1982,19 @@ FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
-
- // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
- // values in dst.
--//
--// FOR j := 0 to 7
--// i := j*8
--// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
- FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
- {
- return vreinterpret_m64_u8(
- vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
- }
-
--// Computes the minimum of the two lower scalar single-precision floating point
--// values of a and b.
--// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b, store the minimum value in the lower element of dst, and copy the upper 3
-+// packed elements from a to the upper element of dst. dst does not follow the
-+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
-+// inputs are NaN or signed-zero values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
- FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
- {
- float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
-@@ -2208,8 +2002,10 @@ FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
- vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
- }
-
--// Sets the low word to the single-precision, floating-point value of b
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
-+// Move the lower single-precision (32-bit) floating-point element from b to the
-+// lower element of dst, and copy the upper 3 packed elements from a to the
-+// upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
- FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_f32(
-@@ -2217,25 +2013,26 @@ FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
- vreinterpretq_f32_m128(a), 0));
- }
-
--// Moves the upper two values of B into the lower two values of A.
--//
--// r3 := a3
--// r2 := a2
--// r1 := b3
--// r0 := b2
--FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
--{
-- float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
-- float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
-+// Move the upper 2 single-precision (32-bit) floating-point elements from b to
-+// the lower 2 elements of dst, and copy the upper 2 elements from a to the
-+// upper 2 elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
-+FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
-+{
-+#if defined(aarch64__)
-+ return vreinterpretq_m128_u64(
-+ vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
-+#else
-+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
-+#endif
- }
-
--// Moves the lower two values of B into the upper two values of A.
--//
--// r3 := b1
--// r2 := b0
--// r1 := a1
--// r0 := a0
-+// Move the lower 2 single-precision (32-bit) floating-point elements from b to
-+// the upper 2 elements of dst, and copy the lower 2 elements from a to the
-+// lower 2 elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
- FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
- {
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
-@@ -2245,7 +2042,7 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
-
- // Create mask from the most significant bit of each 8-bit element in a, and
- // store the result in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
- FORCE_INLINE int _mm_movemask_pi8(__m64 a)
- {
- uint8x8_t input = vreinterpret_u8_m64(a);
-@@ -2264,10 +2061,9 @@ FORCE_INLINE int _mm_movemask_pi8(__m64 a)
- #endif
- }
-
--// NEON does not provide this method
--// Creates a 4-bit mask from the most significant bits of the four
--// single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
-+// Set each bit of mask dst based on the most significant bit of the
-+// corresponding packed single-precision (32-bit) floating-point element in a.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
- FORCE_INLINE int _mm_movemask_ps(__m128 a)
- {
- uint32x4_t input = vreinterpretq_u32_m128(a);
-@@ -2288,14 +2084,9 @@ FORCE_INLINE int _mm_movemask_ps(__m128 a)
- #endif
- }
-
--// Multiplies the four single-precision, floating-point values of a and b.
--//
--// r0 := a0 * b0
--// r1 := a1 * b1
--// r2 := a2 * b2
--// r3 := a3 * b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
-+// Multiply packed single-precision (32-bit) floating-point elements in a and b,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
- FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_f32(
-@@ -2305,11 +2096,7 @@ FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
- // Multiply the lower single-precision (32-bit) floating-point element in a and
- // b, store the result in the lower element of dst, and copy the upper 3 packed
- // elements from a to the upper elements of dst.
--//
--// dst[31:0] := a[31:0] * b[31:0]
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
- FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_mul_ps(a, b));
-@@ -2318,16 +2105,16 @@ FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
- // Multiply the packed unsigned 16-bit integers in a and b, producing
- // intermediate 32-bit integers, and store the high 16 bits of the intermediate
- // integers in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
- FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
- {
- return vreinterpret_m64_u16(vshrn_n_u32(
- vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
- }
-
--// Computes the bitwise OR of the four single-precision, floating-point values
--// of a and b.
--// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
-+// Compute the bitwise OR of packed single-precision (32-bit) floating-point
-+// elements in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
- FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_s32(
-@@ -2336,65 +2123,53 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
-
- // Average packed unsigned 8-bit integers in a and b, and store the results in
- // dst.
--//
--// FOR j := 0 to 7
--// i := j*8
--// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
- #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
-
- // Average packed unsigned 16-bit integers in a and b, and store the results in
- // dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
- #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
-
- // Extract a 16-bit integer from a, selected with imm8, and store the result in
- // the lower element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
- #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
-
- // Copy a to dst, and insert the 16-bit integer i into dst at the location
- // specified by imm8.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
- #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
-
- // Compare packed signed 16-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
- #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
-
- // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
- #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
-
- // Compare packed signed 16-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
- #define _m_pminsw(a, b) _mm_min_pi16(a, b)
-
- // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
- #define _m_pminub(a, b) _mm_min_pu8(a, b)
-
- // Create mask from the most significant bit of each 8-bit element in a, and
- // store the result in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
- #define _m_pmovmskb(a) _mm_movemask_pi8(a)
-
- // Multiply the packed unsigned 16-bit integers in a and b, producing
- // intermediate 32-bit integers, and store the high 16 bits of the intermediate
- // integers in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
- #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
-
- // Fetch the line of data from memory that contains address p to a location in
-@@ -2422,26 +2197,22 @@ FORCE_INLINE void _mm_prefetch(char const *p, int i)
- // b, then horizontally sum each consecutive 8 differences to produce four
- // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
- // 16 bits of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
- #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
-
- // Shuffle 16-bit integers in a using the control in imm8, and store the results
- // in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
- #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
-
- // Compute the approximate reciprocal of packed single-precision (32-bit)
- // floating-point elements in a, and store the results in dst. The maximum
- // relative error for this approximation is less than 1.5*2^-12.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
- FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
- {
- float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
- recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
--#if SSE2NEON_PRECISE_DIV
-- // Additional Netwon-Raphson iteration for accuracy
-- recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
--#endif
- return vreinterpretq_m128_f32(recip);
- }
-
-@@ -2449,30 +2220,21 @@ FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
- // floating-point element in a, store the result in the lower element of dst,
- // and copy the upper 3 packed elements from a to the upper elements of dst. The
- // maximum relative error for this approximation is less than 1.5*2^-12.
--//
--// dst[31:0] := (1.0 / a[31:0])
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
- FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
- {
- return _mm_move_ss(a, _mm_rcp_ps(a));
- }
-
--// Computes the approximations of the reciprocal square roots of the four
--// single-precision floating point values of in.
--// The current precision is 1% error.
--// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
-+// Compute the approximate reciprocal square root of packed single-precision
-+// (32-bit) floating-point elements in a, and store the results in dst. The
-+// maximum relative error for this approximation is less than 1.5*2^-12.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
- FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
- {
- float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
--#if SSE2NEON_PRECISE_SQRT
-- // Additional Netwon-Raphson iteration for accuracy
- out = vmulq_f32(
- out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
-- out = vmulq_f32(
-- out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
--#endif
- return vreinterpretq_m128_f32(out);
- }
-
-@@ -2480,7 +2242,7 @@ FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
- // (32-bit) floating-point element in a, store the result in the lower element
- // of dst, and copy the upper 3 packed elements from a to the upper elements of
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
- FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
- {
- return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
-@@ -2490,7 +2252,7 @@ FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
- // b, then horizontally sum each consecutive 8 differences to produce four
- // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
- // 16 bits of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
- FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
- {
- uint64x1_t t = vpaddl_u32(vpaddl_u16(
-@@ -2502,7 +2264,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
- // Macro: Set the flush zero bits of the MXCSR control and status register to
- // the value in unsigned 32-bit integer a. The flush zero may contain any of the
- // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
- FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
- {
- // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
-@@ -2531,16 +2293,18 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
- #endif
- }
-
--// Sets the four single-precision, floating-point values to the four inputs.
--// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
-+// Set packed single-precision (32-bit) floating-point elements in dst with the
-+// supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
- FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
- {
- float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
- return vreinterpretq_m128_f32(vld1q_f32(data));
- }
-
--// Sets the four single-precision, floating-point values to w.
--// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-+// Broadcast single-precision (32-bit) floating-point value a to all elements of
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
- FORCE_INLINE __m128 _mm_set_ps1(float _w)
- {
- return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-@@ -2550,7 +2314,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
- // the value in unsigned 32-bit integer a. The rounding mode may contain any of
- // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
- // _MM_ROUND_TOWARD_ZERO
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
- FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
- {
- union {
-@@ -2595,45 +2359,48 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
-
- // Copy single-precision (32-bit) floating-point element a to the lower element
- // of dst, and zero the upper 3 elements.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
- FORCE_INLINE __m128 _mm_set_ss(float a)
- {
- return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
- }
-
--// Sets the four single-precision, floating-point values to w.
--//
--// r0 := r1 := r2 := r3 := w
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-+// Broadcast single-precision (32-bit) floating-point value a to all elements of
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
- FORCE_INLINE __m128 _mm_set1_ps(float _w)
- {
- return vreinterpretq_m128_f32(vdupq_n_f32(_w));
- }
-
-+// Set the MXCSR control and status register with the value in unsigned 32-bit
-+// integer a.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
- // FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
- FORCE_INLINE void _mm_setcsr(unsigned int a)
- {
- _MM_SET_ROUNDING_MODE(a);
- }
-
-+// Get the unsigned 32-bit value of the MXCSR control and status register.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
- // FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
- FORCE_INLINE unsigned int _mm_getcsr()
- {
- return _MM_GET_ROUNDING_MODE();
- }
-
--// Sets the four single-precision, floating-point values to the four inputs in
--// reverse order.
--// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
-+// Set packed single-precision (32-bit) floating-point elements in dst with the
-+// supplied values in reverse order.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
- FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
- {
- float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
- return vreinterpretq_m128_f32(vld1q_f32(data));
- }
-
--// Clears the four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
-+// Return vector of type __m128 with all elements set to zero.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
- FORCE_INLINE __m128 _mm_setzero_ps(void)
- {
- return vreinterpretq_m128_f32(vdupq_n_f32(0));
-@@ -2641,7 +2408,7 @@ FORCE_INLINE __m128 _mm_setzero_ps(void)
-
- // Shuffle 16-bit integers in a using the control in imm8, and store the results
- // in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
- #ifdef _sse2neon_shuffle
- #define _mm_shuffle_pi16(a, imm) \
- __extension__({ \
-@@ -2775,19 +2542,17 @@ FORCE_INLINE void _mm_lfence(void)
- })
- #endif
-
--// Computes the approximations of square roots of the four single-precision,
--// floating-point values of a. First computes reciprocal square roots and then
--// reciprocals of the four values.
--//
--// r0 := sqrt(a0)
--// r1 := sqrt(a1)
--// r2 := sqrt(a2)
--// r3 := sqrt(a3)
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
-+// Compute the square root of packed single-precision (32-bit) floating-point
-+// elements in a, and store the results in dst.
-+// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement
-+// square root by multiplying input in with its reciprocal square root before
-+// using the Newton-Raphson method to approximate the results.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
- FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
- {
--#if SSE2NEON_PRECISE_SQRT
-+#if defined(__aarch64__)
-+ return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
-+#else
- float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-
- // Test for vrsqrteq_f32(0) -> positive infinity case.
-@@ -2798,28 +2563,23 @@ FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
- recip = vreinterpretq_f32_u32(
- vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
-
-- // Additional Netwon-Raphson iteration for accuracy
- recip = vmulq_f32(
- vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
- recip);
-+ // Additional Netwon-Raphson iteration for accuracy
- recip = vmulq_f32(
- vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
- recip);
-
- // sqrt(s) = s * 1/sqrt(s)
- return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
--#elif defined(__aarch64__)
-- return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
--#else
-- float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-- float32x4_t sq = vrecpeq_f32(recipsq);
-- return vreinterpretq_m128_f32(sq);
- #endif
- }
-
--// Computes the approximation of the square root of the scalar single-precision
--// floating point value of in.
--// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
-+// Compute the square root of the lower single-precision (32-bit) floating-point
-+// element in a, store the result in the lower element of dst, and copy the
-+// upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
- FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
- {
- float32_t value =
-@@ -2828,8 +2588,10 @@ FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
- vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
- }
-
--// Stores four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
-+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-+// or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
- FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
- {
- vst1q_f32(p, vreinterpretq_f32_m128(a));
-@@ -2838,21 +2600,16 @@ FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
- // Store the lower single-precision (32-bit) floating-point element from a into
- // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
- // boundary or a general-protection exception may be generated.
--//
--// MEM[mem_addr+31:mem_addr] := a[31:0]
--// MEM[mem_addr+63:mem_addr+32] := a[31:0]
--// MEM[mem_addr+95:mem_addr+64] := a[31:0]
--// MEM[mem_addr+127:mem_addr+96] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
- FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
- {
- float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- vst1q_f32(p, vdupq_n_f32(a0));
- }
-
--// Stores the lower single - precision, floating - point value.
--// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
-+// Store the lower single-precision (32-bit) floating-point element from a into
-+// memory. mem_addr does not need to be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
- FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
- {
- vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
-@@ -2861,34 +2618,20 @@ FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
- // Store the lower single-precision (32-bit) floating-point element from a into
- // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
- // boundary or a general-protection exception may be generated.
--//
--// MEM[mem_addr+31:mem_addr] := a[31:0]
--// MEM[mem_addr+63:mem_addr+32] := a[31:0]
--// MEM[mem_addr+95:mem_addr+64] := a[31:0]
--// MEM[mem_addr+127:mem_addr+96] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
- #define _mm_store1_ps _mm_store_ps1
-
--// Stores the upper two single-precision, floating-point values of a to the
--// address p.
--//
--// *p0 := a2
--// *p1 := a3
--//
--// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
-+// Store the upper 2 single-precision (32-bit) floating-point elements from a
-+// into memory.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
- FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
- {
- *p = vreinterpret_m64_f32(vget_high_f32(a));
- }
-
--// Stores the lower two single-precision floating point values of a to the
--// address p.
--//
--// *p0 := a0
--// *p1 := a1
--//
--// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
-+// Store the lower 2 single-precision (32-bit) floating-point elements from a
-+// into memory.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
- FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
- {
- *p = vreinterpret_m64_f32(vget_low_f32(a));
-@@ -2897,13 +2640,7 @@ FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
- // Store 4 single-precision (32-bit) floating-point elements from a into memory
- // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--// MEM[mem_addr+31:mem_addr] := a[127:96]
--// MEM[mem_addr+63:mem_addr+32] := a[95:64]
--// MEM[mem_addr+95:mem_addr+64] := a[63:32]
--// MEM[mem_addr+127:mem_addr+96] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
- FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
- {
- float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
-@@ -2911,22 +2648,24 @@ FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
- vst1q_f32(p, rev);
- }
-
--// Stores four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
-+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-+// elements) from a into memory. mem_addr does not need to be aligned on any
-+// particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
- FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
- {
- vst1q_f32(p, vreinterpretq_f32_m128(a));
- }
-
- // Stores 16-bits of integer data a at the address p.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
- FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
- {
- vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
- }
-
- // Stores 64-bits of integer data a at the address p.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
- FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
- {
- vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
-@@ -2934,7 +2673,7 @@ FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
-
- // Store 64-bits of integer data from a into memory using a non-temporal memory
- // hint.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
- FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
- {
- vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
-@@ -2942,7 +2681,7 @@ FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
-
- // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
- // point elements) from a into memory using a non-temporal memory hint.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
- FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
- {
- #if __has_builtin(__builtin_nontemporal_store)
-@@ -2952,14 +2691,10 @@ FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
- #endif
- }
-
--// Subtracts the four single-precision, floating-point values of a and b.
--//
--// r0 := a0 - b0
--// r1 := a1 - b1
--// r2 := a2 - b2
--// r3 := a3 - b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
-+// Subtract packed single-precision (32-bit) floating-point elements in b from
-+// packed single-precision (32-bit) floating-point elements in a, and store the
-+// results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
- FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_f32(
-@@ -2970,11 +2705,7 @@ FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
- // the lower single-precision (32-bit) floating-point element in a, store the
- // result in the lower element of dst, and copy the upper 3 packed elements from
- // a to the upper elements of dst.
--//
--// dst[31:0] := a[31:0] - b[31:0]
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
- FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_sub_ps(a, b));
-@@ -2983,7 +2714,7 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
- // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
- // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
- // transposed matrix in these vectors (row0 now contains column 0, etc.).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
- #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
- do { \
- float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
-@@ -3008,7 +2739,7 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
- #define _mm_ucomineq_ss _mm_comineq_ss
-
- // Return vector of type __m128i with undefined elements.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
- FORCE_INLINE __m128i _mm_undefined_si128(void)
- {
- #if defined(__GNUC__) || defined(__clang__)
-@@ -3023,7 +2754,7 @@ FORCE_INLINE __m128i _mm_undefined_si128(void)
- }
-
- // Return vector of type __m128 with undefined elements.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
- FORCE_INLINE __m128 _mm_undefined_ps(void)
- {
- #if defined(__GNUC__) || defined(__clang__)
-@@ -3037,15 +2768,9 @@ FORCE_INLINE __m128 _mm_undefined_ps(void)
- #endif
- }
-
--// Selects and interleaves the upper two single-precision, floating-point values
--// from a and b.
--//
--// r0 := a2
--// r1 := b2
--// r2 := a3
--// r3 := b3
--//
--// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
-+// Unpack and interleave single-precision (32-bit) floating-point elements from
-+// the high half a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
- FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
- {
- #if defined(__aarch64__)
-@@ -3059,15 +2784,9 @@ FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
- #endif
- }
-
--// Selects and interleaves the lower two single-precision, floating-point values
--// from a and b.
--//
--// r0 := a0
--// r1 := b0
--// r2 := a1
--// r3 := b1
--//
--// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
-+// Unpack and interleave single-precision (32-bit) floating-point elements from
-+// the low half of a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
- FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
- {
- #if defined(__aarch64__)
-@@ -3081,9 +2800,9 @@ FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
- #endif
- }
-
--// Computes bitwise EXOR (exclusive-or) of the four single-precision,
--// floating-point values of a and b.
--// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
-+// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
-+// elements in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
- FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
- {
- return vreinterpretq_m128_s32(
-@@ -3092,42 +2811,32 @@ FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
-
- /* SSE2 */
-
--// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
--// unsigned 16-bit integers in b.
--// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
-+// Add packed 16-bit integers in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
- FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
- vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
--// unsigned 32-bit integers in b.
--//
--// r0 := a0 + b0
--// r1 := a1 + b1
--// r2 := a2 + b2
--// r3 := a3 + b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-+// Add packed 32-bit integers in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
- FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
- vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or
--// unsigned 32-bit integers in b.
--// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-+// Add packed 64-bit integers in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
- FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s64(
- vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
- }
-
--// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
--// unsigned 8-bit integers in b.
--// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
-+// Add packed 8-bit integers in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
- FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
-@@ -3136,7 +2845,7 @@ FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
-
- // Add packed double-precision (64-bit) floating-point elements in a and b, and
- // store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
- FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3155,11 +2864,7 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
- // Add the lower double-precision (64-bit) floating-point element in a and b,
- // store the result in the lower element of dst, and copy the upper element from
- // a to the upper element of dst.
--//
--// dst[63:0] := a[63:0] + b[63:0]
--// dst[127:64] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
- FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3175,25 +2880,16 @@ FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
- }
-
- // Add 64-bit integers a and b, and store the result in dst.
--//
--// dst[63:0] := a[63:0] + b[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
- FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
- {
- return vreinterpret_m64_s64(
- vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
- }
-
--// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
--// and saturates.
--//
--// r0 := SignedSaturate(a0 + b0)
--// r1 := SignedSaturate(a1 + b1)
--// ...
--// r7 := SignedSaturate(a7 + b7)
--//
--// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
-+// Add packed signed 16-bit integers in a and b using saturation, and store the
-+// results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
- FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
-@@ -3202,13 +2898,7 @@ FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
-
- // Add packed signed 8-bit integers in a and b using saturation, and store the
- // results in dst.
--//
--// FOR j := 0 to 15
--// i := j*8
--// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
- FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
-@@ -3217,16 +2907,16 @@ FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
-
- // Add packed unsigned 16-bit integers in a and b using saturation, and store
- // the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
- FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
- vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
- }
-
--// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
--// b and saturates..
--// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
-+// Add packed unsigned 8-bit integers in a and b using saturation, and store the
-+// results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
- FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -3235,25 +2925,16 @@ FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
-
- // Compute the bitwise AND of packed double-precision (64-bit) floating-point
- // elements in a and b, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// dst[i+63:i] := a[i+63:i] AND b[i+63:i]
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
- FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
- {
- return vreinterpretq_m128d_s64(
- vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
- }
-
--// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
--// b.
--//
--// r := a & b
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
-+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-+// and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
- FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
-@@ -3262,13 +2943,7 @@ FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
-
- // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
- // elements in a and then AND with b, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
- FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
- {
- // *NOTE* argument swap
-@@ -3276,12 +2951,9 @@ FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
- vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
- }
-
--// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
--// 128-bit value in a.
--//
--// r := (~a) & b
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
-+// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
-+// AND with b, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
- FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
-@@ -3289,30 +2961,18 @@ FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
- vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
- }
-
--// Computes the average of the 8 unsigned 16-bit integers in a and the 8
--// unsigned 16-bit integers in b and rounds.
--//
--// r0 := (a0 + b0) / 2
--// r1 := (a1 + b1) / 2
--// ...
--// r7 := (a7 + b7) / 2
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
-+// Average packed unsigned 16-bit integers in a and b, and store the results in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
- FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
- {
- return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
- vreinterpretq_u16_m128i(b));
- }
-
--// Computes the average of the 16 unsigned 8-bit integers in a and the 16
--// unsigned 8-bit integers in b and rounds.
--//
--// r0 := (a0 + b0) / 2
--// r1 := (a1 + b1) / 2
--// ...
--// r15 := (a15 + b15) / 2
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
-+// Average packed unsigned 8-bit integers in a and b, and store the results in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
- FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -3321,17 +2981,17 @@ FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
-
- // Shift a left by imm8 bytes while shifting in zeros, and store the results in
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
- #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
-
- // Shift a right by imm8 bytes while shifting in zeros, and store the results in
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
- #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
-
- // Cast vector of type __m128d to type __m128. This intrinsic is only used for
- // compilation and does not generate any instructions, thus it has zero latency.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
- FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
- {
- return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
-@@ -3339,7 +2999,7 @@ FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
-
- // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
- // compilation and does not generate any instructions, thus it has zero latency.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
- FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
- {
- return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
-@@ -3347,15 +3007,15 @@ FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
-
- // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
- // compilation and does not generate any instructions, thus it has zero latency.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
- FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
- {
- return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
- }
-
--// Applies a type cast to reinterpret four 32-bit floating point values passed
--// in as a 128-bit parameter as packed 32-bit integers.
--// https://msdn.microsoft.com/en-us/library/bb514099.aspx
-+// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
-+// compilation and does not generate any instructions, thus it has zero latency.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
- FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
- {
- return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
-@@ -3363,7 +3023,7 @@ FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
-
- // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
- // compilation and does not generate any instructions, thus it has zero latency.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
- FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
- {
- #if defined(__aarch64__)
-@@ -3373,9 +3033,9 @@ FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
- #endif
- }
-
--// Applies a type cast to reinterpret four 32-bit integers passed in as a
--// 128-bit parameter as packed 32-bit floating point values.
--// https://msdn.microsoft.com/en-us/library/bb514029.aspx
-+// Cast vector of type __m128i to type __m128. This intrinsic is only used for
-+// compilation and does not generate any instructions, thus it has zero latency.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
- FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
- {
- return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
-@@ -3406,9 +3066,9 @@ FORCE_INLINE void _mm_clflush(void const *p)
- #endif
- }
-
--// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
--// unsigned 16-bit integers in b for equality.
--// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
-+// Compare packed 16-bit integers in a and b for equality, and store the results
-+// in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
- FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
-@@ -3416,16 +3076,17 @@ FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
- }
-
- // Compare packed 32-bit integers in a and b for equality, and store the results
--// in dst
-+// in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
- FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u32(
- vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
--// unsigned 8-bit integers in b for equality.
--// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
-+// Compare packed 8-bit integers in a and b for equality, and store the results
-+// in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
- FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -3434,7 +3095,7 @@ FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for equality, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
- FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3452,7 +3113,7 @@ FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for equality, store the result in the lower element of dst, and copy the
- // upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
- FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
-@@ -3460,7 +3121,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for greater-than-or-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
- FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3482,7 +3143,7 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for greater-than-or-equal, store the result in the lower element of dst,
- // and copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
- FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3500,39 +3161,27 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
- #endif
- }
-
--// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
--// in b for greater than.
--//
--// r0 := (a0 > b0) ? 0xffff : 0x0
--// r1 := (a1 > b1) ? 0xffff : 0x0
--// ...
--// r7 := (a7 > b7) ? 0xffff : 0x0
--//
--// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
-+// Compare packed signed 16-bit integers in a and b for greater-than, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
- FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
- vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
--// in b for greater than.
--// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
-+// Compare packed signed 32-bit integers in a and b for greater-than, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
- FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u32(
- vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
--// in b for greater than.
--//
--// r0 := (a0 > b0) ? 0xff : 0x0
--// r1 := (a1 > b1) ? 0xff : 0x0
--// ...
--// r15 := (a15 > b15) ? 0xff : 0x0
--//
--// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
-+// Compare packed signed 8-bit integers in a and b for greater-than, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
- FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -3541,7 +3190,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for greater-than, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
- FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3563,7 +3212,7 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for greater-than, store the result in the lower element of dst, and copy
- // the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
- FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3583,7 +3232,7 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for less-than-or-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
- FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3605,7 +3254,7 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for less-than-or-equal, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
- FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3623,34 +3272,30 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
- #endif
- }
-
--// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
--// in b for less than.
--//
--// r0 := (a0 < b0) ? 0xffff : 0x0
--// r1 := (a1 < b1) ? 0xffff : 0x0
--// ...
--// r7 := (a7 < b7) ? 0xffff : 0x0
--//
--// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
-+// Compare packed signed 16-bit integers in a and b for less-than, and store the
-+// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
-+// order of the operands switched.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
- FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
- vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--
--// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
--// in b for less than.
--// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
-+// Compare packed signed 32-bit integers in a and b for less-than, and store the
-+// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
-+// order of the operands switched.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
- FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u32(
- vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
--// in b for lesser than.
--// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
-+// Compare packed signed 8-bit integers in a and b for less-than, and store the
-+// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
-+// order of the operands switched.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
- FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -3659,7 +3304,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for less-than, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
- FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3681,7 +3326,7 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for less-than, store the result in the lower element of dst, and copy the
- // upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
- FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3700,7 +3345,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
- FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3718,7 +3363,7 @@ FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-equal, store the result in the lower element of dst, and copy the
- // upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
- FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
-@@ -3726,7 +3371,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-greater-than-or-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
- FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3751,7 +3396,7 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-greater-than-or-equal, store the result in the lower element of
- // dst, and copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
- FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
-@@ -3759,7 +3404,7 @@ FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-greater-than, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
- FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3784,7 +3429,7 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-greater-than, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
- FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
-@@ -3792,7 +3437,7 @@ FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-less-than-or-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
- FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3817,7 +3462,7 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-less-than-or-equal, store the result in the lower element of dst,
- // and copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
- FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
-@@ -3825,7 +3470,7 @@ FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-less-than, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
- FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3850,7 +3495,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-less-than, store the result in the lower element of dst, and copy
- // the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
- FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
-@@ -3858,7 +3503,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // to see if neither is NaN, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
- FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3890,7 +3535,7 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b to see if neither is NaN, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
- FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3912,7 +3557,7 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // to see if either is NaN, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
- FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3945,7 +3590,7 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b to see if either is NaN, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
- FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3967,7 +3612,7 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
-
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for greater-than-or-equal, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
- FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3982,7 +3627,7 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
-
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for greater-than, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
- FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3997,7 +3642,7 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
-
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for less-than-or-equal, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
- FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4012,7 +3657,7 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
-
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for less-than, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
- FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4027,7 +3672,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
-
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for equality, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
- FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4048,7 +3693,7 @@ FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
-
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for not-equal, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
- FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
- {
- return !_mm_comieq_sd(a, b);
-@@ -4056,14 +3701,7 @@ FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
-
- // Convert packed signed 32-bit integers in a to packed double-precision
- // (64-bit) floating-point elements, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*32
--// m := j*64
--// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
- FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
- {
- #if defined(__aarch64__)
-@@ -4076,9 +3714,9 @@ FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
- #endif
- }
-
--// Converts the four signed 32-bit integer values of a to single-precision,
--// floating-point values
--// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
-+// Convert packed signed 32-bit integers in a to packed single-precision
-+// (32-bit) floating-point elements, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
- FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
- {
- return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
-@@ -4086,14 +3724,7 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
-
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed 32-bit integers, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// k := 64*j
--// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
- FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
- {
- // vrnd32xq_f64 not supported on clang
-@@ -4112,14 +3743,7 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
-
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed 32-bit integers, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// k := 64*j
--// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
- FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
- {
- __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-@@ -4132,15 +3756,7 @@ FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed single-precision (32-bit) floating-point elements, and store the
- // results in dst.
--//
--// FOR j := 0 to 1
--// i := 32*j
--// k := 64*j
--// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
--// ENDFOR
--// dst[127:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
- FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4155,14 +3771,7 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
-
- // Convert packed signed 32-bit integers in a to packed double-precision
- // (64-bit) floating-point elements, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*32
--// m := j*64
--// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
- FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
- {
- #if defined(__aarch64__)
-@@ -4175,15 +3784,9 @@ FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
- #endif
- }
-
--// Converts the four single-precision, floating-point values of a to signed
--// 32-bit integer values.
--//
--// r0 := (int) a0
--// r1 := (int) a1
--// r2 := (int) a2
--// r3 := (int) a3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
-+// Convert packed single-precision (32-bit) floating-point elements in a to
-+// packed 32-bit integers, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
- // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
- // does not support! It is supported on ARMv8-A however.
- FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
-@@ -4240,14 +3843,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed double-precision (64-bit) floating-point elements, and store the
- // results in dst.
--//
--// FOR j := 0 to 1
--// i := 64*j
--// k := 32*j
--// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
- FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
- {
- #if defined(__aarch64__)
-@@ -4261,10 +3857,7 @@ FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
- }
-
- // Copy the lower double-precision (64-bit) floating-point element of a to dst.
--//
--// dst[63:0] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
- FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4276,10 +3869,7 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
-
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 32-bit integer, and store the result in dst.
--//
--// dst[31:0] := Convert_FP64_To_Int32(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
- FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4293,10 +3883,7 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
-
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 64-bit integer, and store the result in dst.
--//
--// dst[63:0] := Convert_FP64_To_Int64(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
- FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4310,17 +3897,14 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
-
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 64-bit integer, and store the result in dst.
--//
--// dst[63:0] := Convert_FP64_To_Int64(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
- #define _mm_cvtsd_si64x _mm_cvtsd_si64
-
- // Convert the lower double-precision (64-bit) floating-point element in b to a
- // single-precision (32-bit) floating-point element, store the result in the
- // lower element of dst, and copy the upper 3 packed elements from a to the
- // upper elements of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
- FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4334,33 +3918,27 @@ FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
- }
-
- // Copy the lower 32-bit integer in a to dst.
--//
--// dst[31:0] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
- FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
- {
- return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
- }
-
- // Copy the lower 64-bit integer in a to dst.
--//
--// dst[63:0] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
- FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
- {
- return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
- }
-
- // Copy the lower 64-bit integer in a to dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
- #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
- // Convert the signed 32-bit integer b to a double-precision (64-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
- FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
- {
- #if defined(__aarch64__)
-@@ -4374,21 +3952,12 @@ FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
- }
-
- // Copy the lower 64-bit integer in a to dst.
--//
--// dst[63:0] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
- #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
--// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
--// zero extending the upper bits.
--//
--// r0 := a
--// r1 := 0x0
--// r2 := 0x0
--// r3 := 0x0
--//
--// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
-+// Copy 32-bit integer a to the lower elements of dst, and zero the upper
-+// elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
- FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
- {
- return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
-@@ -4397,7 +3966,7 @@ FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
- // Convert the signed 64-bit integer b to a double-precision (64-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
- FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
- {
- #if defined(__aarch64__)
-@@ -4410,11 +3979,9 @@ FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
- #endif
- }
-
--// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
--// zero extending the upper bits.
--//
--// r0 := a
--// r1 := 0x0
-+// Copy 64-bit integer a to the lower element of dst, and zero the upper
-+// element.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
- FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
- {
- return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
-@@ -4422,24 +3989,20 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
-
- // Copy 64-bit integer a to the lower element of dst, and zero the upper
- // element.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
- #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
-
- // Convert the signed 64-bit integer b to a double-precision (64-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
- #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
-
- // Convert the lower single-precision (32-bit) floating-point element in b to a
- // double-precision (64-bit) floating-point element, store the result in the
- // lower element of dst, and copy the upper element from a to the upper element
- // of dst.
--//
--// dst[63:0] := Convert_FP32_To_FP64(b[31:0])
--// dst[127:64] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
- FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
- {
- double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-@@ -4454,7 +4017,7 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
-
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed 32-bit integers with truncation, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
- FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
- {
- double a0 = ((double *) &a)[0];
-@@ -4464,7 +4027,7 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
-
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed 32-bit integers with truncation, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
- FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
- {
- double a0 = ((double *) &a)[0];
-@@ -4473,9 +4036,9 @@ FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
- return vreinterpret_m64_s32(vld1_s32(data));
- }
-
--// Converts the four single-precision, floating-point values of a to signed
--// 32-bit integer values using truncate.
--// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
-+// Convert packed single-precision (32-bit) floating-point elements in a to
-+// packed 32-bit integers with truncation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
- FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
- {
- return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
-@@ -4483,10 +4046,7 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
-
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 32-bit integer with truncation, and store the result in dst.
--//
--// dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
- FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
- {
- double ret = *((double *) &a);
-@@ -4495,10 +4055,7 @@ FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
-
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 64-bit integer with truncation, and store the result in dst.
--//
--// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
- FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4511,21 +4068,12 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
-
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 64-bit integer with truncation, and store the result in dst.
--//
--// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
- #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
-
- // Divide packed double-precision (64-bit) floating-point elements in a by
- // packed elements in b, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := 64*j
--// dst[i+63:i] := a[i+63:i] / b[i+63:i]
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
- FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4545,7 +4093,7 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
- // lower double-precision (64-bit) floating-point element in b, store the result
- // in the lower element of dst, and copy the upper element from a to the upper
- // element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
- FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4558,16 +4106,16 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
- #endif
- }
-
--// Extracts the selected signed or unsigned 16-bit integer from a and zero
--// extends.
--// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
-+// Extract a 16-bit integer from a, selected with imm8, and store the result in
-+// the lower element of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
- // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
- #define _mm_extract_epi16(a, imm) \
- vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
-
--// Inserts the least significant 16 bits of b into the selected 16-bit integer
--// of a.
--// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
-+// Copy a to dst, and insert the 16-bit integer i into dst at the location
-+// specified by imm8.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
- // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
- // __constrange(0,8) int imm)
- #define _mm_insert_epi16(a, b, imm) \
-@@ -4576,12 +4124,10 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
- vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
- })
-
--// Loads two double-precision from 16-byte aligned memory, floating-point
--// values.
--//
--// dst[127:0] := MEM[mem_addr+127:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
-+// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-+// boundary or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
- FORCE_INLINE __m128d _mm_load_pd(const double *p)
- {
- #if defined(__aarch64__)
-@@ -4595,21 +4141,13 @@ FORCE_INLINE __m128d _mm_load_pd(const double *p)
-
- // Load a double-precision (64-bit) floating-point element from memory into both
- // elements of dst.
--//
--// dst[63:0] := MEM[mem_addr+63:mem_addr]
--// dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
- #define _mm_load_pd1 _mm_load1_pd
-
- // Load a double-precision (64-bit) floating-point element from memory into the
- // lower of dst, and zero the upper element. mem_addr does not need to be
- // aligned on any particular boundary.
--//
--// dst[63:0] := MEM[mem_addr+63:mem_addr]
--// dst[127:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
- FORCE_INLINE __m128d _mm_load_sd(const double *p)
- {
- #if defined(__aarch64__)
-@@ -4621,8 +4159,9 @@ FORCE_INLINE __m128d _mm_load_sd(const double *p)
- #endif
- }
-
--// Loads 128-bit value. :
--// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
-+// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
-+// on a 16-byte boundary or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
- FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
- {
- return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-@@ -4630,11 +4169,7 @@ FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
-
- // Load a double-precision (64-bit) floating-point element from memory into both
- // elements of dst.
--//
--// dst[63:0] := MEM[mem_addr+63:mem_addr]
--// dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
- FORCE_INLINE __m128d _mm_load1_pd(const double *p)
- {
- #if defined(__aarch64__)
-@@ -4647,11 +4182,7 @@ FORCE_INLINE __m128d _mm_load1_pd(const double *p)
- // Load a double-precision (64-bit) floating-point element from memory into the
- // upper element of dst, and copy the lower element from a to dst. mem_addr does
- // not need to be aligned on any particular boundary.
--//
--// dst[63:0] := a[63:0]
--// dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
- FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
- {
- #if defined(__aarch64__)
-@@ -4664,7 +4195,7 @@ FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
- }
-
- // Load 64-bit integer from memory into the first element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
- FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
- {
- /* Load the lower 64 bits of the value pointed to by p into the
-@@ -4677,11 +4208,7 @@ FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
- // Load a double-precision (64-bit) floating-point element from memory into the
- // lower element of dst, and copy the upper element from a to dst. mem_addr does
- // not need to be aligned on any particular boundary.
--//
--// dst[63:0] := MEM[mem_addr+63:mem_addr]
--// dst[127:64] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
- FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
- {
- #if defined(__aarch64__)
-@@ -4697,11 +4224,7 @@ FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
- // Load 2 double-precision (64-bit) floating-point elements from memory into dst
- // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--// dst[63:0] := MEM[mem_addr+127:mem_addr+64]
--// dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
- FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
- {
- #if defined(__aarch64__)
-@@ -4714,39 +4237,32 @@ FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
- }
-
- // Loads two double-precision from unaligned memory, floating-point values.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
- FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
- {
- return _mm_load_pd(p);
- }
-
--// Loads 128-bit value. :
--// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
-+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
-+// be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
- FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
- {
- return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
- }
-
- // Load unaligned 32-bit integer from memory into the first element of dst.
--//
--// dst[31:0] := MEM[mem_addr+31:mem_addr]
--// dst[MAX:32] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
- FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
- {
- return vreinterpretq_m128i_s32(
- vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
- }
-
--// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
--// integers from b.
--//
--// r0 := (a0 * b0) + (a1 * b1)
--// r1 := (a2 * b2) + (a3 * b3)
--// r2 := (a4 * b4) + (a5 * b5)
--// r3 := (a6 * b6) + (a7 * b7)
--// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
-+// Multiply packed signed 16-bit integers in a and b, producing intermediate
-+// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
-+// 32-bit integers, and pack the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
- FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
- {
- int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-@@ -4771,7 +4287,7 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
- // (elements are not stored when the highest bit is not set in the corresponding
- // element) and a non-temporal memory hint. mem_addr does not need to be aligned
- // on any particular boundary.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
- FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
- {
- int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
-@@ -4782,18 +4298,18 @@ FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
- vst1q_s8((int8_t *) mem_addr, masked);
- }
-
--// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
--// signed 16-bit integers from b.
--// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
-+// Compare packed signed 16-bit integers in a and b, and store packed maximum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
- FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
- vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
--// 16 unsigned 8-bit integers from b.
--// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
-+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
- FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -4802,7 +4318,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b,
- // and store packed maximum values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
- FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4830,7 +4346,7 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b, store the maximum value in the lower element of dst, and copy the upper
- // element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
- FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4843,18 +4359,18 @@ FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
- #endif
- }
-
--// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
--// signed 16-bit integers from b.
--// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
-+// Compare packed signed 16-bit integers in a and b, and store packed minimum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
- FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
- vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
--// 16 unsigned 8-bit integers from b.
--// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
-+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
- FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -4863,7 +4379,7 @@ FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
-
- // Compare packed double-precision (64-bit) floating-point elements in a and b,
- // and store packed minimum values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
- FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4890,7 +4406,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b, store the minimum value in the lower element of dst, and copy the upper
- // element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
- FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4905,11 +4421,7 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
-
- // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
- // upper element.
--//
--// dst[63:0] := a[63:0]
--// dst[127:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
- FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
- {
- return vreinterpretq_m128i_s64(
-@@ -4919,11 +4431,7 @@ FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
- // Move the lower double-precision (64-bit) floating-point element from b to the
- // lower element of dst, and copy the upper element from a to the upper element
- // of dst.
--//
--// dst[63:0] := b[63:0]
--// dst[127:64] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
- FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
- {
- return vreinterpretq_m128d_f32(
-@@ -4931,10 +4439,9 @@ FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
- vget_high_f32(vreinterpretq_f32_m128d(a))));
- }
-
--// NEON does not provide a version of this function.
--// Creates a 16-bit mask from the most significant bits of the 16 signed or
--// unsigned 8-bit integers in a and zero extends the upper bits.
--// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
-+// Create mask from the most significant bit of each 8-bit element in a, and
-+// store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
- FORCE_INLINE int _mm_movemask_epi8(__m128i a)
- {
- // Use increasingly wide shifts+adds to collect the sign bits
-@@ -5017,7 +4524,7 @@ FORCE_INLINE int _mm_movemask_epi8(__m128i a)
-
- // Set each bit of mask dst based on the most significant bit of the
- // corresponding packed double-precision (64-bit) floating-point element in a.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
- FORCE_INLINE int _mm_movemask_pd(__m128d a)
- {
- uint64x2_t input = vreinterpretq_u64_m128d(a);
-@@ -5026,10 +4533,7 @@ FORCE_INLINE int _mm_movemask_pd(__m128d a)
- }
-
- // Copy the lower 64-bit integer in a to dst.
--//
--// dst[63:0] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
- FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
- {
- return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
-@@ -5037,11 +4541,7 @@ FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
-
- // Copy the 64-bit integer a to the lower element of dst, and zero the upper
- // element.
--//
--// dst[63:0] := a[63:0]
--// dst[127:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
- FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
- {
- return vreinterpretq_m128i_s64(
-@@ -5050,9 +4550,7 @@ FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
-
- // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
- // a and b, and store the unsigned 64-bit results in dst.
--//
--// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
--// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
- FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
- {
- // vmull_u32 upcasts instead of masking, so we downcast.
-@@ -5063,7 +4561,7 @@ FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
-
- // Multiply packed double-precision (64-bit) floating-point elements in a and b,
- // and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
- FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -5082,7 +4580,7 @@ FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
- // Multiply the lower double-precision (64-bit) floating-point element in a and
- // b, store the result in the lower element of dst, and copy the upper element
- // from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
- FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_mul_pd(a, b));
-@@ -5090,25 +4588,17 @@ FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
-
- // Multiply the low unsigned 32-bit integers from a and b, and store the
- // unsigned 64-bit result in dst.
--//
--// dst[63:0] := a[31:0] * b[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
- FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
- {
- return vreinterpret_m64_u64(vget_low_u64(
- vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
- }
-
--// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
--// integers from b.
--//
--// r0 := (a0 * b0)[31:16]
--// r1 := (a1 * b1)[31:16]
--// ...
--// r7 := (a7 * b7)[31:16]
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
-+// Multiply the packed signed 16-bit integers in a and b, producing intermediate
-+// 32-bit integers, and store the high 16 bits of the intermediate integers in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
- FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
- {
- /* FIXME: issue with large values because of result saturation */
-@@ -5129,7 +4619,7 @@ FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
- // Multiply the packed unsigned 16-bit integers in a and b, producing
- // intermediate 32-bit integers, and store the high 16 bits of the intermediate
- // integers in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
- FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
- {
- uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
-@@ -5151,15 +4641,9 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
- #endif
- }
-
--// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
--// unsigned 16-bit integers from b.
--//
--// r0 := (a0 * b0)[15:0]
--// r1 := (a1 * b1)[15:0]
--// ...
--// r7 := (a7 * b7)[15:0]
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
-+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
-+// integers, and store the low 16 bits of the intermediate integers in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
- FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
-@@ -5168,27 +4652,25 @@ FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
-
- // Compute the bitwise OR of packed double-precision (64-bit) floating-point
- // elements in a and b, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
- FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
- {
- return vreinterpretq_m128d_s64(
- vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
- }
-
--// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
--//
--// r := a | b
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
-+// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
-+// and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
- FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
- vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
--// saturates.
--// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
-+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-+// using signed saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
- FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
-@@ -5196,19 +4678,9 @@ FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
- vqmovn_s16(vreinterpretq_s16_m128i(b))));
- }
-
--// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
--// and saturates.
--//
--// r0 := SignedSaturate(a0)
--// r1 := SignedSaturate(a1)
--// r2 := SignedSaturate(a2)
--// r3 := SignedSaturate(a3)
--// r4 := SignedSaturate(b0)
--// r5 := SignedSaturate(b1)
--// r6 := SignedSaturate(b2)
--// r7 := SignedSaturate(b3)
--//
--// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
-+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-+// using signed saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
- FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
-@@ -5216,19 +4688,9 @@ FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
- vqmovn_s32(vreinterpretq_s32_m128i(b))));
- }
-
--// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
--// integers and saturates.
--//
--// r0 := UnsignedSaturate(a0)
--// r1 := UnsignedSaturate(a1)
--// ...
--// r7 := UnsignedSaturate(a7)
--// r8 := UnsignedSaturate(b0)
--// r9 := UnsignedSaturate(b1)
--// ...
--// r15 := UnsignedSaturate(b7)
--//
--// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
-+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-+// using unsigned saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
- FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -5241,6 +4703,7 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
- // 'yield' instruction isn't a good fit because it's effectively a nop on most
- // Arm cores. Experience with several databases has shown has shown an 'isb' is
- // a reasonable approximation.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
- FORCE_INLINE void _mm_pause()
- {
- __asm__ __volatile__("isb\n");
-@@ -5250,15 +4713,15 @@ FORCE_INLINE void _mm_pause()
- // b, then horizontally sum each consecutive 8 differences to produce two
- // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
- // 16 bits of 64-bit elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
- FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
- {
- uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
- return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
- }
-
--// Sets the 8 signed 16-bit integer values.
--// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
-+// Set packed 16-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
- FORCE_INLINE __m128i _mm_set_epi16(short i7,
- short i6,
- short i5,
-@@ -5272,33 +4735,31 @@ FORCE_INLINE __m128i _mm_set_epi16(short i7,
- return vreinterpretq_m128i_s16(vld1q_s16(data));
- }
-
--// Sets the 4 signed 32-bit integer values.
--// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
-+// Set packed 32-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
- FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
- {
- int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
- return vreinterpretq_m128i_s32(vld1q_s32(data));
- }
-
--// Returns the __m128i structure with its two 64-bit integer values
--// initialized to the values of the two 64-bit integers passed in.
--// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-+// Set packed 64-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
- FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
- {
- return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
- }
-
--// Returns the __m128i structure with its two 64-bit integer values
--// initialized to the values of the two 64-bit integers passed in.
--// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-+// Set packed 64-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
- FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
- {
- return vreinterpretq_m128i_s64(
- vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
- }
-
--// Sets the 16 signed 8-bit integer values.
--// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
-+// Set packed 8-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
- FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
- signed char b14,
- signed char b13,
-@@ -5326,7 +4787,7 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
-
- // Set packed double-precision (64-bit) floating-point elements in dst with the
- // supplied values.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
- FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
- {
- double ALIGN_STRUCT(16) data[2] = {e0, e1};
-@@ -5339,12 +4800,12 @@ FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
-
- // Broadcast double-precision (64-bit) floating-point value a to all elements of
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
- #define _mm_set_pd1 _mm_set1_pd
-
- // Copy double-precision (64-bit) floating-point element a to the lower element
- // of dst, and zero the upper element.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
- FORCE_INLINE __m128d _mm_set_sd(double a)
- {
- #if defined(__aarch64__)
-@@ -5354,54 +4815,36 @@ FORCE_INLINE __m128d _mm_set_sd(double a)
- #endif
- }
-
--// Sets the 8 signed 16-bit integer values to w.
--//
--// r0 := w
--// r1 := w
--// ...
--// r7 := w
--//
--// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
-+// Broadcast 16-bit integer a to all all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
- FORCE_INLINE __m128i _mm_set1_epi16(short w)
- {
- return vreinterpretq_m128i_s16(vdupq_n_s16(w));
- }
-
--// Sets the 4 signed 32-bit integer values to i.
--//
--// r0 := i
--// r1 := i
--// r2 := i
--// r3 := I
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
-+// Broadcast 32-bit integer a to all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
- FORCE_INLINE __m128i _mm_set1_epi32(int _i)
- {
- return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
- }
-
--// Sets the 2 signed 64-bit integer values to i.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
-+// Broadcast 64-bit integer a to all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
- FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
- {
- return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
- }
-
--// Sets the 2 signed 64-bit integer values to i.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
-+// Broadcast 64-bit integer a to all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
- FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
- {
- return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
- }
-
--// Sets the 16 signed 8-bit integer values to b.
--//
--// r0 := b
--// r1 := b
--// ...
--// r15 := b
--//
--// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
-+// Broadcast 8-bit integer a to all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
- FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
- {
- return vreinterpretq_m128i_s8(vdupq_n_s8(w));
-@@ -5409,7 +4852,7 @@ FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
-
- // Broadcast double-precision (64-bit) floating-point value a to all elements of
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
- FORCE_INLINE __m128d _mm_set1_pd(double d)
- {
- #if defined(__aarch64__)
-@@ -5419,13 +4862,8 @@ FORCE_INLINE __m128d _mm_set1_pd(double d)
- #endif
- }
-
--// Sets the 8 signed 16-bit integer values in reverse order.
--//
--// Return Value
--// r0 := w0
--// r1 := w1
--// ...
--// r7 := w7
-+// Set packed 16-bit integers in dst with the supplied values in reverse order.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
- FORCE_INLINE __m128i _mm_setr_epi16(short w0,
- short w1,
- short w2,
-@@ -5439,8 +4877,8 @@ FORCE_INLINE __m128i _mm_setr_epi16(short w0,
- return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
- }
-
--// Sets the 4 signed 32-bit integer values in reverse order
--// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
-+// Set packed 32-bit integers in dst with the supplied values in reverse order.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
- FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
- {
- int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
-@@ -5448,14 +4886,14 @@ FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
- }
-
- // Set packed 64-bit integers in dst with the supplied values in reverse order.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
- FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
- {
- return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
- }
-
--// Sets the 16 signed 8-bit integer values in reverse order.
--// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
-+// Set packed 8-bit integers in dst with the supplied values in reverse order.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
- FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
- signed char b1,
- signed char b2,
-@@ -5483,14 +4921,14 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
-
- // Set packed double-precision (64-bit) floating-point elements in dst with the
- // supplied values in reverse order.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
- FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
- {
- return _mm_set_pd(e0, e1);
- }
-
- // Return vector of type __m128d with all elements set to zero.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
- FORCE_INLINE __m128d _mm_setzero_pd(void)
- {
- #if defined(__aarch64__)
-@@ -5500,15 +4938,16 @@ FORCE_INLINE __m128d _mm_setzero_pd(void)
- #endif
- }
-
--// Sets the 128-bit value to zero
--// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
-+// Return vector of type __m128i with all elements set to zero.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
- FORCE_INLINE __m128i _mm_setzero_si128(void)
- {
- return vreinterpretq_m128i_s32(vdupq_n_s32(0));
- }
-
--// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
--// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
-+// Shuffle 32-bit integers in a using the control in imm8, and store the results
-+// in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
- // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
- // __constrange(0,255) int imm)
- #ifdef _sse2neon_shuffle
-@@ -5577,11 +5016,7 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
-
- // Shuffle double-precision (64-bit) floating-point elements using the control
- // in imm8, and store the results in dst.
--//
--// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
--// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
- #ifdef _sse2neon_shuffle
- #define _mm_shuffle_pd(a, b, imm8) \
- vreinterpretq_m128d_s64( \
-@@ -5627,17 +5062,7 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
-
- // Shift packed 16-bit integers in a left by count while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF count[63:0] > 15
--// dst[i+15:i] := 0
--// ELSE
--// dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
- FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
- {
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5650,17 +5075,7 @@ FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
-
- // Shift packed 32-bit integers in a left by count while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// IF count[63:0] > 31
--// dst[i+31:i] := 0
--// ELSE
--// dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
- FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
- {
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5673,17 +5088,7 @@ FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
-
- // Shift packed 64-bit integers in a left by count while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// IF count[63:0] > 63
--// dst[i+63:i] := 0
--// ELSE
--// dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
- FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
- {
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5696,17 +5101,7 @@ FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
-
- // Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF imm8[7:0] > 15
--// dst[i+15:i] := 0
--// ELSE
--// dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
- FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
- {
- if (_sse2neon_unlikely(imm & ~15))
-@@ -5717,17 +5112,7 @@ FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
-
- // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// IF imm8[7:0] > 31
--// dst[i+31:i] := 0
--// ELSE
--// dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
- FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
- {
- if (_sse2neon_unlikely(imm & ~31))
-@@ -5738,17 +5123,7 @@ FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
-
- // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// IF imm8[7:0] > 63
--// dst[i+63:i] := 0
--// ELSE
--// dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
- FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
- {
- if (_sse2neon_unlikely(imm & ~63))
-@@ -5759,14 +5134,7 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
-
- // Shift a left by imm8 bytes while shifting in zeros, and store the results in
- // dst.
--//
--// tmp := imm8[7:0]
--// IF tmp > 15
--// tmp := 16
--// FI
--// dst[127:0] := a[127:0] << (tmp*8)
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
- #define _mm_slli_si128(a, imm) \
- __extension__({ \
- int8x16_t ret; \
-@@ -5782,7 +5150,7 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
-
- // Compute the square root of packed double-precision (64-bit) floating-point
- // elements in a, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
- FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -5797,7 +5165,7 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
- // Compute the square root of the lower double-precision (64-bit) floating-point
- // element in b, store the result in the lower element of dst, and copy the
- // upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
- FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -5809,17 +5177,7 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
-
- // Shift packed 16-bit integers in a right by count while shifting in sign bits,
- // and store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF count[63:0] > 15
--// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
--// ELSE
--// dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
- FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
- {
- int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-@@ -5830,17 +5188,7 @@ FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
-
- // Shift packed 32-bit integers in a right by count while shifting in sign bits,
- // and store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// IF count[63:0] > 31
--// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
--// ELSE
--// dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
- FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
- {
- int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-@@ -5851,17 +5199,7 @@ FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
-
- // Shift packed 16-bit integers in a right by imm8 while shifting in sign
- // bits, and store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF imm8[7:0] > 15
--// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
--// ELSE
--// dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
- FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
- {
- const int count = (imm & ~15) ? 15 : imm;
-@@ -5870,17 +5208,7 @@ FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
-
- // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
- // and store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// IF imm8[7:0] > 31
--// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
--// ELSE
--// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
- // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
- #define _mm_srai_epi32(a, imm) \
- __extension__({ \
-@@ -5899,17 +5227,7 @@ FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
-
- // Shift packed 16-bit integers in a right by count while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF count[63:0] > 15
--// dst[i+15:i] := 0
--// ELSE
--// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
- FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
- {
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5922,17 +5240,7 @@ FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
-
- // Shift packed 32-bit integers in a right by count while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// IF count[63:0] > 31
--// dst[i+31:i] := 0
--// ELSE
--// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
- FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
- {
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5945,17 +5253,7 @@ FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
-
- // Shift packed 64-bit integers in a right by count while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// IF count[63:0] > 63
--// dst[i+63:i] := 0
--// ELSE
--// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
- FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
- {
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5968,17 +5266,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-
- // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF imm8[7:0] > 15
--// dst[i+15:i] := 0
--// ELSE
--// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
- #define _mm_srli_epi16(a, imm) \
- __extension__({ \
- __m128i ret; \
-@@ -5993,17 +5281,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-
- // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// IF imm8[7:0] > 31
--// dst[i+31:i] := 0
--// ELSE
--// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
- // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
- #define _mm_srli_epi32(a, imm) \
- __extension__({ \
-@@ -6019,17 +5297,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-
- // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// IF imm8[7:0] > 63
--// dst[i+63:i] := 0
--// ELSE
--// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
- #define _mm_srli_epi64(a, imm) \
- __extension__({ \
- __m128i ret; \
-@@ -6044,14 +5312,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-
- // Shift a right by imm8 bytes while shifting in zeros, and store the results in
- // dst.
--//
--// tmp := imm8[7:0]
--// IF tmp > 15
--// tmp := 16
--// FI
--// dst[127:0] := a[127:0] >> (tmp*8)
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
- #define _mm_srli_si128(a, imm) \
- __extension__({ \
- int8x16_t ret; \
-@@ -6066,7 +5327,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
- // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
- // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
- // or a general-protection exception may be generated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
- FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6079,7 +5340,7 @@ FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
- // Store the lower double-precision (64-bit) floating-point element from a into
- // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
- // boundary or a general-protection exception may be generated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
- FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6095,7 +5356,7 @@ FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
-
- // Store the lower double-precision (64-bit) floating-point element from a into
- // memory. mem_addr does not need to be aligned on any particular boundary.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
- FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6105,8 +5366,9 @@ FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
- #endif
- }
-
--// Stores four 32-bit integer values as (as a __m128i value) at the address p.
--// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-+// Store 128-bits of integer data from a into memory. mem_addr must be aligned
-+// on a 16-byte boundary or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
- FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
- {
- vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-@@ -6115,15 +5377,12 @@ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
- // Store the lower double-precision (64-bit) floating-point element from a into
- // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
- // boundary or a general-protection exception may be generated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
- #define _mm_store1_pd _mm_store_pd1
-
- // Store the upper double-precision (64-bit) floating-point element from a into
- // memory.
--//
--// MEM[mem_addr+63:mem_addr] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
- FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6133,8 +5392,8 @@ FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
- #endif
- }
-
--// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
--// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
-+// Store 64-bit integer from the first element of a into memory.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
- FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
- {
- vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
-@@ -6142,10 +5401,7 @@ FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
-
- // Store the lower double-precision (64-bit) floating-point element from a into
- // memory.
--//
--// MEM[mem_addr+63:mem_addr] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
- FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6158,11 +5414,7 @@ FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
- // Store 2 double-precision (64-bit) floating-point elements from a into memory
- // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--// MEM[mem_addr+63:mem_addr] := a[127:64]
--// MEM[mem_addr+127:mem_addr+64] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
- FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
- {
- float32x4_t f = vreinterpretq_f32_m128d(a);
-@@ -6172,21 +5424,23 @@ FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
- // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
- // elements) from a into memory. mem_addr does not need to be aligned on any
- // particular boundary.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
- FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
- {
- _mm_store_pd(mem_addr, a);
- }
-
--// Stores 128-bits of integer data a at the address p.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
-+// Store 128-bits of integer data from a into memory. mem_addr does not need to
-+// be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
- FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
- {
- vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
- }
-
--// Stores 32-bits of integer data a at the address p.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
-+// Store 32-bit integer from the first element of a into memory. mem_addr does
-+// not need to be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
- FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
- {
- vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
-@@ -6196,7 +5450,7 @@ FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
- // elements) from a into memory using a non-temporal memory hint. mem_addr must
- // be aligned on a 16-byte boundary or a general-protection exception may be
- // generated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
- FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
- {
- #if __has_builtin(__builtin_nontemporal_store)
-@@ -6208,10 +5462,10 @@ FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
- #endif
- }
-
--// Stores the data in a to the address p without polluting the caches. If the
--// cache line containing address p is already in the cache, the cache will be
--// updated.
--// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
-+// Store 128-bits of integer data from a into memory using a non-temporal memory
-+// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
-+// exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
- FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
- {
- #if __has_builtin(__builtin_nontemporal_store)
-@@ -6224,7 +5478,7 @@ FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
- // Store 32-bit integer a into memory using a non-temporal hint to minimize
- // cache pollution. If the cache line containing address mem_addr is already in
- // the cache, the cache will be updated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
- FORCE_INLINE void _mm_stream_si32(int *p, int a)
- {
- vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
-@@ -6233,7 +5487,7 @@ FORCE_INLINE void _mm_stream_si32(int *p, int a)
- // Store 64-bit integer a into memory using a non-temporal hint to minimize
- // cache pollution. If the cache line containing address mem_addr is already in
- // the cache, the cache will be updated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
- FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
- {
- vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
-@@ -6241,32 +5495,25 @@ FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
-
- // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
- // store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
- FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
- vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
--// unsigned 32-bit integers of a.
--//
--// r0 := a0 - b0
--// r1 := a1 - b1
--// r2 := a2 - b2
--// r3 := a3 - b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
-+// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
- FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
- vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
--// and store the results in dst.
--// r0 := a0 - b0
--// r1 := a1 - b1
-+// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
- FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s64(
-@@ -6275,7 +5522,7 @@ FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
-
- // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
- // store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
- FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
-@@ -6285,13 +5532,7 @@ FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
- // Subtract packed double-precision (64-bit) floating-point elements in b from
- // packed double-precision (64-bit) floating-point elements in a, and store the
- // results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// dst[i+63:i] := a[i+63:i] - b[i+63:i]
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
- FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -6311,71 +5552,50 @@ FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
- // the lower double-precision (64-bit) floating-point element in a, store the
- // result in the lower element of dst, and copy the upper element from a to the
- // upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
- FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_sub_pd(a, b));
- }
-
- // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
--//
--// dst[63:0] := a[63:0] - b[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
- FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
- {
- return vreinterpret_m64_s64(
- vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
- }
-
--// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
--// of a and saturates.
--//
--// r0 := SignedSaturate(a0 - b0)
--// r1 := SignedSaturate(a1 - b1)
--// ...
--// r7 := SignedSaturate(a7 - b7)
--//
--// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
-+// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
-+// using saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
- FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s16(
- vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
-
--// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
--// of a and saturates.
--//
--// r0 := SignedSaturate(a0 - b0)
--// r1 := SignedSaturate(a1 - b1)
--// ...
--// r15 := SignedSaturate(a15 - b15)
--//
--// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
-+// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
-+// using saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
- FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
- vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
- }
-
--// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit
--// integers of a and saturates..
--// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
-+// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
-+// integers in a using saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
- FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
- vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
- }
-
--// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
--// integers of a and saturates.
--//
--// r0 := UnsignedSaturate(a0 - b0)
--// r1 := UnsignedSaturate(a1 - b1)
--// ...
--// r15 := UnsignedSaturate(a15 - b15)
--//
--// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
-+// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
-+// integers in a using saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
- FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u8(
-@@ -6390,7 +5610,7 @@ FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
- #define _mm_ucomineq_sd _mm_comineq_sd
-
- // Return vector of type __m128d with undefined elements.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
- FORCE_INLINE __m128d _mm_undefined_pd(void)
- {
- #if defined(__GNUC__) || defined(__clang__)
-@@ -6404,19 +5624,9 @@ FORCE_INLINE __m128d _mm_undefined_pd(void)
- #endif
- }
-
--// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
--// upper 4 signed or unsigned 16-bit integers in b.
--//
--// r0 := a4
--// r1 := b4
--// r2 := a5
--// r3 := b5
--// r4 := a6
--// r5 := b6
--// r6 := a7
--// r7 := b7
--//
--// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
-+// Unpack and interleave 16-bit integers from the high half of a and b, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
- FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6430,9 +5640,9 @@ FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
- #endif
- }
-
--// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
--// upper 2 signed or unsigned 32-bit integers in b.
--// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
-+// Unpack and interleave 32-bit integers from the high half of a and b, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
- FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6446,30 +5656,24 @@ FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
- #endif
- }
-
--// Interleaves the upper signed or unsigned 64-bit integer in a with the
--// upper signed or unsigned 64-bit integer in b.
--//
--// r0 := a1
--// r1 := b1
-+// Unpack and interleave 64-bit integers from the high half of a and b, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
- FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
- {
-+#if defined(__aarch64__)
-+ return vreinterpretq_m128i_s64(
-+ vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-+#else
- int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
- int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
- return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
-+#endif
- }
-
--// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
--// 8 signed or unsigned 8-bit integers in b.
--//
--// r0 := a8
--// r1 := b8
--// r2 := a9
--// r3 := b9
--// ...
--// r14 := a15
--// r15 := b15
--//
--// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
-+// Unpack and interleave 8-bit integers from the high half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
- FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6487,15 +5691,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
-
- // Unpack and interleave double-precision (64-bit) floating-point elements from
- // the high half of a and b, and store the results in dst.
--//
--// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
--// dst[63:0] := src1[127:64]
--// dst[127:64] := src2[127:64]
--// RETURN dst[127:0]
--// }
--// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
- FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -6508,19 +5704,9 @@ FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
- #endif
- }
-
--// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
--// lower 4 signed or unsigned 16-bit integers in b.
--//
--// r0 := a0
--// r1 := b0
--// r2 := a1
--// r3 := b1
--// r4 := a2
--// r5 := b2
--// r6 := a3
--// r7 := b3
--//
--// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
-+// Unpack and interleave 16-bit integers from the low half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
- FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6534,15 +5720,9 @@ FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
- #endif
- }
-
--// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
--// lower 2 signed or unsigned 32 - bit integers in b.
--//
--// r0 := a0
--// r1 := b0
--// r2 := a1
--// r3 := b1
--//
--// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
-+// Unpack and interleave 32-bit integers from the low half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
- FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6556,25 +5736,24 @@ FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
- #endif
- }
-
-+// Unpack and interleave 64-bit integers from the low half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
- FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
- {
-+#if defined(__aarch64__)
-+ return vreinterpretq_m128i_s64(
-+ vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-+#else
- int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
- int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
- return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
-+#endif
- }
-
--// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
--// 8 signed or unsigned 8-bit integers in b.
--//
--// r0 := a0
--// r1 := b0
--// r2 := a1
--// r3 := b1
--// ...
--// r14 := a7
--// r15 := b7
--//
--// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
-+// Unpack and interleave 8-bit integers from the low half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
- FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6590,15 +5769,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
-
- // Unpack and interleave double-precision (64-bit) floating-point elements from
- // the low half of a and b, and store the results in dst.
--//
--// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
--// dst[63:0] := src1[63:0]
--// dst[127:64] := src2[63:0]
--// RETURN dst[127:0]
--// }
--// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
- FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -6613,21 +5784,16 @@ FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
-
- // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
- // elements in a and b, and store the results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
- FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
- {
- return vreinterpretq_m128d_s64(
- veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
- }
-
--// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
--// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
-+// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
-+// and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
- FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
-@@ -6639,17 +5805,7 @@ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
- // Alternatively add and subtract packed double-precision (64-bit)
- // floating-point elements in a to/from packed elements in b, and store the
- // results in dst.
--//
--// FOR j := 0 to 1
--// i := j*64
--// IF ((j & 1) == 0)
--// dst[i+63:i] := a[i+63:i] - b[i+63:i]
--// ELSE
--// dst[i+63:i] := a[i+63:i] + b[i+63:i]
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
- FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
- {
- _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
-@@ -6665,7 +5821,7 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
- // Alternatively add and subtract packed single-precision (32-bit)
- // floating-point elements in a to/from packed elements in b, and store the
- // results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
- FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
- {
- _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
-@@ -6680,7 +5836,7 @@ FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
-
- // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
- // elements in a and b, and pack the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
- FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -6694,9 +5850,9 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
- #endif
- }
-
--// Computes pairwise add of each argument as single-precision, floating-point
--// values a and b.
--// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
-+// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
-+// elements in a and b, and pack the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
- FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
- {
- #if defined(__aarch64__)
-@@ -6714,7 +5870,7 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
-
- // Horizontally subtract adjacent pairs of double-precision (64-bit)
- // floating-point elements in a and b, and pack the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
- FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
- {
- #if defined(__aarch64__)
-@@ -6732,7 +5888,7 @@ FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
-
- // Horizontally subtract adjacent pairs of single-precision (32-bit)
- // floating-point elements in a and b, and pack the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
- FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
- {
- float32x4_t a = vreinterpretq_f32_m128(_a);
-@@ -6749,24 +5905,17 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
- // Load 128-bits of integer data from unaligned memory into dst. This intrinsic
- // may perform better than _mm_loadu_si128 when the data crosses a cache line
- // boundary.
--//
--// dst[127:0] := MEM[mem_addr+127:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
- #define _mm_lddqu_si128 _mm_loadu_si128
-
- // Load a double-precision (64-bit) floating-point element from memory into both
- // elements of dst.
--//
--// dst[63:0] := MEM[mem_addr+63:mem_addr]
--// dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
- #define _mm_loaddup_pd _mm_load1_pd
-
- // Duplicate the low double-precision (64-bit) floating-point element from a,
- // and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
- FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -6780,7 +5929,7 @@ FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
-
- // Duplicate odd-indexed single-precision (32-bit) floating-point elements
- // from a, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
- FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
- {
- #if defined(__aarch64__)
-@@ -6799,7 +5948,7 @@ FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
-
- // Duplicate even-indexed single-precision (32-bit) floating-point elements
- // from a, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
- FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
- {
- #if defined(__aarch64__)
-@@ -6820,13 +5969,7 @@ FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
-
- // Compute the absolute value of packed signed 16-bit integers in a, and store
- // the unsigned results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// dst[i+15:i] := ABS(a[i+15:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
- FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
- {
- return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
-@@ -6834,13 +5977,7 @@ FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
-
- // Compute the absolute value of packed signed 32-bit integers in a, and store
- // the unsigned results in dst.
--//
--// FOR j := 0 to 3
--// i := j*32
--// dst[i+31:i] := ABS(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
- FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
- {
- return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
-@@ -6848,13 +5985,7 @@ FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
-
- // Compute the absolute value of packed signed 8-bit integers in a, and store
- // the unsigned results in dst.
--//
--// FOR j := 0 to 15
--// i := j*8
--// dst[i+7:i] := ABS(a[i+7:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
- FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
- {
- return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
-@@ -6862,13 +5993,7 @@ FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
-
- // Compute the absolute value of packed signed 16-bit integers in a, and store
- // the unsigned results in dst.
--//
--// FOR j := 0 to 3
--// i := j*16
--// dst[i+15:i] := ABS(a[i+15:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
- FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
- {
- return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
-@@ -6876,13 +6001,7 @@ FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
-
- // Compute the absolute value of packed signed 32-bit integers in a, and store
- // the unsigned results in dst.
--//
--// FOR j := 0 to 1
--// i := j*32
--// dst[i+31:i] := ABS(a[i+31:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
- FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
- {
- return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
-@@ -6890,13 +6009,7 @@ FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
-
- // Compute the absolute value of packed signed 8-bit integers in a, and store
- // the unsigned results in dst.
--//
--// FOR j := 0 to 7
--// i := j*8
--// dst[i+7:i] := ABS(a[i+7:i])
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
- FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
- {
- return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
-@@ -6904,11 +6017,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
-
- // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
- // the result right by imm8 bytes, and store the low 16 bytes in dst.
--//
--// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
--// dst[127:0] := tmp[127:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
- #define _mm_alignr_epi8(a, b, imm) \
- __extension__({ \
- uint8x16_t _a = vreinterpretq_u8_m128i(a); \
-@@ -6926,11 +6035,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
-
- // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
- // the result right by imm8 bytes, and store the low 8 bytes in dst.
--//
--// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
--// dst[63:0] := tmp[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
- #define _mm_alignr_pi8(a, b, imm) \
- __extension__({ \
- __m64 ret; \
-@@ -6953,8 +6058,9 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
- ret; \
- })
-
--// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
--// values a and b.
-+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-+// signed 16-bit results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
- FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
- {
- int16x8_t a = vreinterpretq_s16_m128i(_a);
-@@ -6968,8 +6074,9 @@ FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
- #endif
- }
-
--// Computes pairwise add of each argument as a 32-bit signed or unsigned integer
--// values a and b.
-+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-+// signed 32-bit results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
- FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
- {
- int32x4_t a = vreinterpretq_s32_m128i(_a);
-@@ -6985,7 +6092,7 @@ FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
-
- // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
- // signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
- FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
- {
- return vreinterpret_m64_s16(
-@@ -6994,15 +6101,16 @@ FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
-
- // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
- // signed 32-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
- FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
- {
- return vreinterpret_m64_s32(
- vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
- }
-
--// Computes saturated pairwise sub of each argument as a 16-bit signed
--// integer values a and b.
-+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-+// saturation, and pack the signed 16-bit results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
- FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
- {
- #if defined(__aarch64__)
-@@ -7025,7 +6133,7 @@ FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
-
- // Horizontally add adjacent pairs of signed 16-bit integers in a and b using
- // saturation, and pack the signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
- FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
- {
- int16x4_t a = vreinterpret_s16_m64(_a);
-@@ -7040,7 +6148,7 @@ FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
-
- // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
- // the signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
- FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
- {
- int16x8_t a = vreinterpretq_s16_m128i(_a);
-@@ -7056,7 +6164,7 @@ FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
-
- // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
- // the signed 32-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
- FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
- {
- int32x4_t a = vreinterpretq_s32_m128i(_a);
-@@ -7072,7 +6180,7 @@ FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
-
- // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
- // the signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
- FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
- {
- int16x4_t a = vreinterpret_s16_m64(_a);
-@@ -7087,7 +6195,7 @@ FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
-
- // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
- // the signed 32-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
- FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
- {
- int32x2_t a = vreinterpret_s32_m64(_a);
-@@ -7100,9 +6208,9 @@ FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
- #endif
- }
-
--// Computes saturated pairwise difference of each argument as a 16-bit signed
--// integer values a and b.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
-+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-+// using saturation, and pack the signed 16-bit results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
- FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
- {
- int16x8_t a = vreinterpretq_s16_m128i(_a);
-@@ -7118,7 +6226,7 @@ FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
-
- // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
- // using saturation, and pack the signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
- FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
- {
- int16x4_t a = vreinterpret_s16_m64(_a);
-@@ -7135,12 +6243,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
- // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
- // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
- // and pack the saturated results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
--// a[i+7:i]*b[i+7:i] )
--// ENDFOR
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
- FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
- {
- #if defined(__aarch64__)
-@@ -7179,7 +6282,7 @@ FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
- // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
- // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
- // pack the saturated results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
- FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
- {
- uint16x4_t a = vreinterpret_u16_m64(_a);
-@@ -7204,12 +6307,7 @@ FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
- // Multiply packed signed 16-bit integers in a and b, producing intermediate
- // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
- // the packed 16-bit integers in dst.
--//
--// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
--// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
--// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
--// ...
--// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
- FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
- {
- // Has issues due to saturation
-@@ -7233,7 +6331,7 @@ FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
- // Multiply packed signed 16-bit integers in a and b, producing intermediate
- // signed 32-bit integers. Truncate each intermediate integer to the 18 most
- // significant bits, round by adding 1, and store bits [16:1] to dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
- FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
- {
- int32x4_t mul_extend =
-@@ -7245,7 +6343,7 @@ FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
-
- // Shuffle packed 8-bit integers in a according to shuffle control mask in the
- // corresponding 8-bit element of b, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
- FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
- {
- int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
-@@ -7275,18 +6373,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
-
- // Shuffle packed 8-bit integers in a according to shuffle control mask in the
- // corresponding 8-bit element of b, and store the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*8
--// IF b[i+7] == 1
--// dst[i+7:i] := 0
--// ELSE
--// index[2:0] := b[i+2:i]
--// dst[i+7:i] := a[index*8+7:index*8]
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
- FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
- {
- const int8x8_t controlMask =
-@@ -7299,16 +6386,7 @@ FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
- // 16-bit integer in b is negative, and store the results in dst.
- // Element in dst are zeroed out when the corresponding element
- // in b is zero.
--//
--// for i in 0..7
--// if b[i] < 0
--// r[i] := -a[i]
--// else if b[i] == 0
--// r[i] := 0
--// else
--// r[i] := a[i]
--// fi
--// done
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
- FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
- {
- int16x8_t a = vreinterpretq_s16_m128i(_a);
-@@ -7336,16 +6414,7 @@ FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
- // 32-bit integer in b is negative, and store the results in dst.
- // Element in dst are zeroed out when the corresponding element
- // in b is zero.
--//
--// for i in 0..3
--// if b[i] < 0
--// r[i] := -a[i]
--// else if b[i] == 0
--// r[i] := 0
--// else
--// r[i] := a[i]
--// fi
--// done
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
- FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
- {
- int32x4_t a = vreinterpretq_s32_m128i(_a);
-@@ -7374,16 +6443,7 @@ FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
- // 8-bit integer in b is negative, and store the results in dst.
- // Element in dst are zeroed out when the corresponding element
- // in b is zero.
--//
--// for i in 0..15
--// if b[i] < 0
--// r[i] := -a[i]
--// else if b[i] == 0
--// r[i] := 0
--// else
--// r[i] := a[i]
--// fi
--// done
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
- FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
- {
- int8x16_t a = vreinterpretq_s8_m128i(_a);
-@@ -7412,19 +6472,7 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
- // Negate packed 16-bit integers in a when the corresponding signed 16-bit
- // integer in b is negative, and store the results in dst. Element in dst are
- // zeroed out when the corresponding element in b is zero.
--//
--// FOR j := 0 to 3
--// i := j*16
--// IF b[i+15:i] < 0
--// dst[i+15:i] := -(a[i+15:i])
--// ELSE IF b[i+15:i] == 0
--// dst[i+15:i] := 0
--// ELSE
--// dst[i+15:i] := a[i+15:i]
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
- FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
- {
- int16x4_t a = vreinterpret_s16_m64(_a);
-@@ -7453,19 +6501,7 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
- // Negate packed 32-bit integers in a when the corresponding signed 32-bit
- // integer in b is negative, and store the results in dst. Element in dst are
- // zeroed out when the corresponding element in b is zero.
--//
--// FOR j := 0 to 1
--// i := j*32
--// IF b[i+31:i] < 0
--// dst[i+31:i] := -(a[i+31:i])
--// ELSE IF b[i+31:i] == 0
--// dst[i+31:i] := 0
--// ELSE
--// dst[i+31:i] := a[i+31:i]
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
- FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
- {
- int32x2_t a = vreinterpret_s32_m64(_a);
-@@ -7494,19 +6530,7 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
- // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
- // in b is negative, and store the results in dst. Element in dst are zeroed out
- // when the corresponding element in b is zero.
--//
--// FOR j := 0 to 7
--// i := j*8
--// IF b[i+7:i] < 0
--// dst[i+7:i] := -(a[i+7:i])
--// ELSE IF b[i+7:i] == 0
--// dst[i+7:i] := 0
--// ELSE
--// dst[i+7:i] := a[i+7:i]
--// FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
- FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
- {
- int8x8_t a = vreinterpret_s8_m64(_a);
-@@ -7536,15 +6560,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
-
- // Blend packed 16-bit integers from a and b using control mask imm8, and store
- // the results in dst.
--//
--// FOR j := 0 to 7
--// i := j*16
--// IF imm8[j]
--// dst[i+15:i] := b[i+15:i]
--// ELSE
--// dst[i+15:i] := a[i+15:i]
--// FI
--// ENDFOR
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
- // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
- // __constrange(0,255) int imm)
- #define _mm_blend_epi16(a, b, imm) \
-@@ -7565,7 +6581,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
-
- // Blend packed double-precision (64-bit) floating-point elements from a and b
- // using control mask imm8, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
- #define _mm_blend_pd(a, b, imm) \
- __extension__({ \
- const uint64_t _mask[2] = { \
-@@ -7579,7 +6595,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
-
- // Blend packed single-precision (32-bit) floating-point elements from a and b
- // using mask, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
- FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
- {
- const uint32_t ALIGN_STRUCT(16)
-@@ -7595,15 +6611,7 @@ FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
-
- // Blend packed 8-bit integers from a and b using mask, and store the results in
- // dst.
--//
--// FOR j := 0 to 15
--// i := j*8
--// IF mask[i+7]
--// dst[i+7:i] := b[i+7:i]
--// ELSE
--// dst[i+7:i] := a[i+7:i]
--// FI
--// ENDFOR
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
- FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
- {
- // Use a signed shift right to create a mask with the sign bit
-@@ -7616,7 +6624,7 @@ FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
-
- // Blend packed double-precision (64-bit) floating-point elements from a and b
- // using mask, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
- FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
- {
- uint64x2_t mask =
-@@ -7634,7 +6642,7 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
-
- // Blend packed single-precision (32-bit) floating-point elements from a and b
- // using mask, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
- FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
- {
- // Use a signed shift right to create a mask with the sign bit
-@@ -7648,7 +6656,7 @@ FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
- // Round the packed double-precision (64-bit) floating-point elements in a up
- // to an integer value, and store the results as packed double-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
- FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -7662,7 +6670,7 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
- // Round the packed single-precision (32-bit) floating-point elements in a up to
- // an integer value, and store the results as packed single-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
- FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -7677,7 +6685,7 @@ FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
- // an integer value, store the result as a double-precision floating-point
- // element in the lower element of dst, and copy the upper element from a to the
- // upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
- FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_ceil_pd(b));
-@@ -7687,11 +6695,7 @@ FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
- // an integer value, store the result as a single-precision floating-point
- // element in the lower element of dst, and copy the upper 3 packed elements
- // from a to the upper elements of dst.
--//
--// dst[31:0] := CEIL(b[31:0])
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
- FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_ceil_ps(b));
-@@ -7714,16 +6718,18 @@ FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
- #endif
- }
-
--// Converts the four signed 16-bit integers in the lower 64 bits to four signed
--// 32-bit integers.
-+// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
- FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
- {
- return vreinterpretq_m128i_s32(
- vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
- }
-
--// Converts the two signed 16-bit integers in the lower 32 bits two signed
--// 32-bit integers.
-+// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
- FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
- {
- int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
-@@ -7732,16 +6738,18 @@ FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
- return vreinterpretq_m128i_s64(s64x2);
- }
-
--// Converts the two signed 32-bit integers in the lower 64 bits to two signed
--// 64-bit integers.
-+// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
- FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
- {
- return vreinterpretq_m128i_s64(
- vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
- }
-
--// Converts the four unsigned 8-bit integers in the lower 16 bits to four
--// unsigned 32-bit integers.
-+// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
- FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
- {
- int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
-@@ -7749,8 +6757,9 @@ FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
- return vreinterpretq_m128i_s16(s16x8);
- }
-
--// Converts the four unsigned 8-bit integers in the lower 32 bits to four
--// unsigned 32-bit integers.
-+// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
- FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
- {
- int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
-@@ -7759,8 +6768,9 @@ FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
- return vreinterpretq_m128i_s32(s32x4);
- }
-
--// Converts the two signed 8-bit integers in the lower 32 bits to four
--// signed 64-bit integers.
-+// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
-+// integers, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
- FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
- {
- int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
-@@ -7770,16 +6780,18 @@ FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
- return vreinterpretq_m128i_s64(s64x2);
- }
-
--// Converts the four unsigned 16-bit integers in the lower 64 bits to four
--// unsigned 32-bit integers.
-+// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
- FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
- {
- return vreinterpretq_m128i_u32(
- vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
- }
-
--// Converts the two unsigned 16-bit integers in the lower 32 bits to two
--// unsigned 64-bit integers.
-+// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
- FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
- {
- uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
-@@ -7788,8 +6800,9 @@ FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
- return vreinterpretq_m128i_u64(u64x2);
- }
-
--// Converts the two unsigned 32-bit integers in the lower 64 bits to two
--// unsigned 64-bit integers.
-+// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
- FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
- {
- return vreinterpretq_m128i_u64(
-@@ -7798,7 +6811,7 @@ FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
-
- // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
- // and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
- FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
- {
- uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
-@@ -7806,9 +6819,9 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
- return vreinterpretq_m128i_u16(u16x8);
- }
-
--// Converts the four unsigned 8-bit integers in the lower 32 bits to four
--// unsigned 32-bit integers.
--// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
-+// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
- FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
- {
- uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
-@@ -7817,8 +6830,9 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
- return vreinterpretq_m128i_u32(u32x4);
- }
-
--// Converts the two unsigned 8-bit integers in the lower 16 bits to two
--// unsigned 64-bit integers.
-+// Zero extend packed unsigned 8-bit integers in the low 8 byte sof a to packed
-+// 64-bit integers, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
- FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
- {
- uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
-@@ -7831,7 +6845,7 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
- // Conditionally multiply the packed double-precision (64-bit) floating-point
- // elements in a and b using the high 4 bits in imm8, sum the four products, and
- // conditionally store the sum in dst using the low 4 bits of imm8.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
- FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
- {
- // Generate mask value from constant immediate bit value
-@@ -7877,7 +6891,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
- // Conditionally multiply the packed single-precision (32-bit) floating-point
- // elements in a and b using the high 4 bits in imm8, sum the four products,
- // and conditionally store the sum in dst using the low 4 bits of imm.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
- FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
- {
- #if defined(__aarch64__)
-@@ -7918,22 +6932,24 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
- return vreinterpretq_m128_f32(res);
- }
-
--// Extracts the selected signed or unsigned 32-bit integer from a and zero
--// extends.
-+// Extract a 32-bit integer from a, selected with imm8, and store the result in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
- // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
- #define _mm_extract_epi32(a, imm) \
- vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
-
--// Extracts the selected signed or unsigned 64-bit integer from a and zero
--// extends.
-+// Extract a 64-bit integer from a, selected with imm8, and store the result in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
- // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
- #define _mm_extract_epi64(a, imm) \
- vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
-
--// Extracts the selected signed or unsigned 8-bit integer from a and zero
--// extends.
--// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
-+// Extract an 8-bit integer from a, selected with imm8, and store the result in
-+// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
-+// __constrange(0,16) int imm)
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
- #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
-
- // Extracts the selected single-precision (32-bit) floating-point from a.
-@@ -7943,7 +6959,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
- // Round the packed double-precision (64-bit) floating-point elements in a down
- // to an integer value, and store the results as packed double-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
- FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -7957,7 +6973,7 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
- // Round the packed single-precision (32-bit) floating-point elements in a down
- // to an integer value, and store the results as packed single-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
- FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -7972,7 +6988,7 @@ FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
- // an integer value, store the result as a double-precision floating-point
- // element in the lower element of dst, and copy the upper element from a to the
- // upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
- FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
- {
- return _mm_move_sd(a, _mm_floor_pd(b));
-@@ -7982,18 +6998,15 @@ FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
- // an integer value, store the result as a single-precision floating-point
- // element in the lower element of dst, and copy the upper 3 packed elements
- // from a to the upper elements of dst.
--//
--// dst[31:0] := FLOOR(b[31:0])
--// dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
- FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
- {
- return _mm_move_ss(a, _mm_floor_ps(b));
- }
-
--// Inserts the least significant 32 bits of b into the selected 32-bit integer
--// of a.
-+// Copy a to dst, and insert the 32-bit integer i into dst at the location
-+// specified by imm8.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
- // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
- // __constrange(0,4) int imm)
- #define _mm_insert_epi32(a, b, imm) \
-@@ -8002,8 +7015,9 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
- vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
- })
-
--// Inserts the least significant 64 bits of b into the selected 64-bit integer
--// of a.
-+// Copy a to dst, and insert the 64-bit integer i into dst at the location
-+// specified by imm8.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
- // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
- // __constrange(0,2) int imm)
- #define _mm_insert_epi64(a, b, imm) \
-@@ -8012,8 +7026,9 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
- vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
- })
-
--// Inserts the least significant 8 bits of b into the selected 8-bit integer
--// of a.
-+// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
-+// location specified by imm8.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
- // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
- // __constrange(0,16) int imm)
- #define _mm_insert_epi8(a, b, imm) \
-@@ -8025,7 +7040,7 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
- // Copy a to tmp, then insert a single-precision (32-bit) floating-point
- // element from b into tmp using the control in imm8. Store tmp to dst using
- // the mask in imm8 (elements are zeroed out when the corresponding bit is set).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
- #define _mm_insert_ps(a, b, imm8) \
- __extension__({ \
- float32x4_t tmp1 = \
-@@ -8045,17 +7060,9 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
- vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \
- })
-
--// epi versions of min/max
--// Computes the pariwise maximums of the four signed 32-bit integer values of a
--// and b.
--//
--// A 128-bit parameter that can be defined with the following equations:
--// r0 := (a0 > b0) ? a0 : b0
--// r1 := (a1 > b1) ? a1 : b1
--// r2 := (a2 > b2) ? a2 : b2
--// r3 := (a3 > b3) ? a3 : b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
-+// Compare packed signed 32-bit integers in a and b, and store packed maximum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
- FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
-@@ -8064,7 +7071,7 @@ FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
-
- // Compare packed signed 8-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
- FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
-@@ -8073,7 +7080,7 @@ FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
-
- // Compare packed unsigned 16-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
- FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
-@@ -8082,23 +7089,16 @@ FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
-
- // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
- FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u32(
- vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
- }
-
--// Computes the pariwise minima of the four signed 32-bit integer values of a
--// and b.
--//
--// A 128-bit parameter that can be defined with the following equations:
--// r0 := (a0 < b0) ? a0 : b0
--// r1 := (a1 < b1) ? a1 : b1
--// r2 := (a2 < b2) ? a2 : b2
--// r3 := (a3 < b3) ? a3 : b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
-+// Compare packed signed 32-bit integers in a and b, and store packed minimum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
- FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
-@@ -8107,7 +7107,7 @@ FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
-
- // Compare packed signed 8-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
- FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s8(
-@@ -8116,7 +7116,7 @@ FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
-
- // Compare packed unsigned 16-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
- FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
-@@ -8125,7 +7125,7 @@ FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
-
- // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
- FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u32(
-@@ -8134,21 +7134,7 @@ FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
-
- // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
- // in a, store the minimum and index in dst, and zero the remaining bits in dst.
--//
--// index[2:0] := 0
--// min[15:0] := a[15:0]
--// FOR j := 0 to 7
--// i := j*16
--// IF a[i+15:i] < min[15:0]
--// index[2:0] := j
--// min[15:0] := a[i+15:i]
--// FI
--// ENDFOR
--// dst[15:0] := min[15:0]
--// dst[18:16] := index[2:0]
--// dst[127:19] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
- FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
- {
- __m128i dst;
-@@ -8198,7 +7184,7 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
- // quadruplets from a. One quadruplet is selected from b starting at on the
- // offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
- // integers selected from a starting at the offset specified in imm8.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
- FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
- {
- uint8x16_t _a, _b;
-@@ -8278,9 +7264,7 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
-
- // Multiply the low signed 32-bit integers from each packed 64-bit element in
- // a and b, and store the signed 64-bit results in dst.
--//
--// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
--// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
- FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
- {
- // vmull_s32 upcasts instead of masking, so we downcast.
-@@ -8289,26 +7273,18 @@ FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
- return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
- }
-
--// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
--// unsigned 32-bit integers from b.
--// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
-+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
-+// integers, and store the low 32 bits of the intermediate integers in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
- FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_s32(
- vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
-
--// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
--// integers and saturates.
--//
--// r0 := UnsignedSaturate(a0)
--// r1 := UnsignedSaturate(a1)
--// r2 := UnsignedSaturate(a2)
--// r3 := UnsignedSaturate(a3)
--// r4 := UnsignedSaturate(b0)
--// r5 := UnsignedSaturate(b1)
--// r6 := UnsignedSaturate(b2)
--// r7 := UnsignedSaturate(b3)
-+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-+// using unsigned saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
- FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
- {
- return vreinterpretq_m128i_u16(
-@@ -8319,7 +7295,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
- // Round the packed double-precision (64-bit) floating-point elements in a using
- // the rounding parameter, and store the results as packed double-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
- FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
- {
- #if defined(__aarch64__)
-@@ -8448,7 +7424,7 @@ FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
- // the rounding parameter, store the result as a double-precision floating-point
- // element in the lower element of dst, and copy the upper element from a to the
- // upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
- FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
- {
- return _mm_move_sd(a, _mm_round_pd(b, rounding));
-@@ -8468,7 +7444,7 @@ FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
- // (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress
- // exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
- // _MM_SET_ROUNDING_MODE
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
- FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
- {
- return _mm_move_ss(a, _mm_round_ps(b, rounding));
-@@ -8477,10 +7453,7 @@ FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
- // Load 128-bits of integer data from memory into dst using a non-temporal
- // memory hint. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--// dst[127:0] := MEM[mem_addr+127:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
- FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
- {
- #if __has_builtin(__builtin_nontemporal_store)
-@@ -8492,7 +7465,7 @@ FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
-
- // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
- // all 1's, and return 1 if the result is zero, otherwise return 0.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
- FORCE_INLINE int _mm_test_all_ones(__m128i a)
- {
- return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
-@@ -8501,7 +7474,7 @@ FORCE_INLINE int _mm_test_all_ones(__m128i a)
-
- // Compute the bitwise AND of 128 bits (representing integer data) in a and
- // mask, and return 1 if the result is zero, otherwise return 0.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
- FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
- {
- int64x2_t a_and_mask =
-@@ -8514,7 +7487,7 @@ FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
- // the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
- // zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
- // otherwise return 0.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
- FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
- {
- uint64x2_t zf =
-@@ -8529,7 +7502,7 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
- // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
- // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
- // otherwise set CF to 0. Return the CF value.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
- FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
- {
- int64x2_t s64 =
-@@ -8542,14 +7515,14 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
- // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
- // otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
- // otherwise return 0.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
- #define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
-
- // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
- // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
- // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
- // otherwise set CF to 0. Return the ZF value.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
- FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
- {
- int64x2_t s64 =
-@@ -9028,7 +8001,7 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
- FORCE_INLINE int _sse2neon_clz(unsigned int x)
- {
- #if _MSC_VER
-- DWORD cnt = 0;
-+ unsigned long cnt = 0;
- if (_BitScanForward(&cnt, x))
- return cnt;
- return 32;
-@@ -9040,7 +8013,7 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x)
- FORCE_INLINE int _sse2neon_ctz(unsigned int x)
- {
- #if _MSC_VER
-- DWORD cnt = 0;
-+ unsigned long cnt = 0;
- if (_BitScanReverse(&cnt, x))
- return 31 - cnt;
- return 32;
-@@ -9053,18 +8026,16 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
- {
- #if _MSC_VER
- unsigned long cnt;
--#ifdef defined(SSE2NEON_HAS_BITSCAN64)
-- (defined(_M_AMD64) || defined(__x86_64__))
-- if((_BitScanForward64(&cnt, x))
-- return (int)(cnt);
-+#if defined(SSE2NEON_HAS_BITSCAN64)
-+ if ((_BitScanForward64(&cnt, x))
-+ return (int)(cnt);
- #else
- if (_BitScanForward(&cnt, (unsigned long) (x)))
- return (int) cnt;
- if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
- return (int) (cnt + 32);
--#endif
-- return 64;
--#else
-+#endif /* SSE2NEON_HAS_BITSCAN64 */
-+#else /* assume GNU compatible compilers */
- return x != 0 ? __builtin_ctzll(x) : 64;
- #endif
- }
-@@ -9155,7 +8126,7 @@ FORCE_INLINE int _mm_cmpestrc(__m128i a,
-
- // Compare packed strings in a and b with lengths la and lb using the control
- // in imm8, and store the generated index in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestri
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
- FORCE_INLINE int _mm_cmpestri(__m128i a,
- int la,
- __m128i b,
-@@ -9168,7 +8139,7 @@ FORCE_INLINE int _mm_cmpestri(__m128i a,
-
- // Compare packed strings in a and b with lengths la and lb using the control
- // in imm8, and store the generated mask in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrm
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
- FORCE_INLINE __m128i
- _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
- {
-@@ -9324,8 +8295,8 @@ FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
- }
-
- // Starting with the initial value in crc, accumulates a CRC32 value for
--// unsigned 16-bit integer v.
--// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
-+// unsigned 16-bit integer v, and stores the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
- FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
- {
- #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-@@ -9342,8 +8313,8 @@ FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
- }
-
- // Starting with the initial value in crc, accumulates a CRC32 value for
--// unsigned 32-bit integer v.
--// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
-+// unsigned 32-bit integer v, and stores the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
- FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
- {
- #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-@@ -9360,8 +8331,8 @@ FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
- }
-
- // Starting with the initial value in crc, accumulates a CRC32 value for
--// unsigned 64-bit integer v.
--// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
-+// unsigned 64-bit integer v, and stores the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
- FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
- {
- #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-@@ -9376,8 +8347,8 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
- }
-
- // Starting with the initial value in crc, accumulates a CRC32 value for
--// unsigned 8-bit integer v.
--// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
-+// unsigned 8-bit integer v, and stores the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
- FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
- {
- #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-@@ -9486,43 +8457,61 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
-
- /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
- #define SSE2NEON_AES_H0(x) (x)
--static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
--static const uint8_t SSE2NEON_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
-+static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
-+static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
- #undef SSE2NEON_AES_H0
-
--// In the absence of crypto extensions, implement aesenc using regular neon
-+/* x_time function and matrix multiply function */
-+#if !defined(__aarch64__)
-+#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
-+#define SSE2NEON_MULTIPLY(x, y) \
-+ (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
-+ ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
-+ ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
-+ ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
-+#endif
-+
-+// In the absence of crypto extensions, implement aesenc using regular NEON
- // intrinsics instead. See:
- // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
- // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
--// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
--// for more information Reproduced with permission of the author.
-+// for more information.
- FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
- {
- #if defined(__aarch64__)
-- static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
-- 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
-- 0xc, 0x1, 0x6, 0xb};
-- static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-- 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
-+ static const uint8_t shift_rows[] = {
-+ 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
-+ 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
-+ };
-+ static const uint8_t ror32by8[] = {
-+ 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-+ 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-+ };
-
- uint8x16_t v;
- uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-- // shift rows
-+ /* shift rows */
- w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
-- // sub bytes
-- v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
-+ /* sub bytes */
-+ // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
-+ // look up each of the table. After each lookup, we load the next table
-+ // which locates at the next 64-bytes. In the meantime, the index in the
-+ // table would be smaller than it was, so the index parameters of
-+ // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
-+ v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-+ // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-- // mix columns
-+ /* mix columns */
- w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
- w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
- w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
-- // add round key
-+ /* add round key */
- return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
- #else /* ARMv7-A implementation for a table-based AES */
-@@ -9587,31 +8576,34 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
- FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
- {
- #if defined(__aarch64__)
-- static const uint8_t inv_shift_rows[] = {0x0, 0xd, 0xa, 0x7, 0x4, 0x1,
-- 0xe, 0xb, 0x8, 0x5, 0x2, 0xf,
-- 0xc, 0x9, 0x6, 0x3};
-- static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-- 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
-+ static const uint8_t inv_shift_rows[] = {
-+ 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-+ 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-+ };
-+ static const uint8_t ror32by8[] = {
-+ 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-+ 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-+ };
-
- uint8x16_t v;
- uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-- // shift rows
-+ // inverse shift rows
- w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-
-- // sub bytes
-- v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_rsbox), w);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_rsbox + 0x40), w - 0x40);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_rsbox + 0x80), w - 0x80);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_rsbox + 0xc0), w - 0xc0);
-+ // inverse sub bytes
-+ v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-
-+ // inverse mix columns
- // muliplying 'v' by 4 in GF(2^8)
- w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
- w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
- v ^= w;
- v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-
-- // mix columns
- w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
- 0x1b); // muliplying 'v' by 2 in GF(2^8)
- w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-@@ -9621,35 +8613,29 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
- return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
- #else /* ARMv7-A NEON implementation */
--/* FIXME: optimized for NEON */
--#define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
--#define MULTIPLY(x, y) \
-- (((y & 1) * x) ^ ((y >> 1 & 1) * XT(x)) ^ ((y >> 2 & 1) * XT(XT(x))) ^ \
-- ((y >> 3 & 1) * XT(XT(XT(x)))) ^ ((y >> 4 & 1) * XT(XT(XT(XT(x))))))
--
-+ /* FIXME: optimized for NEON */
- uint8_t i, e, f, g, h, v[4][4];
- uint8_t *_a = (uint8_t *) &a;
- for (i = 0; i < 16; ++i) {
-- v[((i / 4) + (i % 4)) % 4][i % 4] = SSE2NEON_rsbox[_a[i]];
-+ v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
- }
-
-+ // inverse mix columns
- for (i = 0; i < 4; ++i) {
- e = v[i][0];
- f = v[i][1];
- g = v[i][2];
- h = v[i][3];
-
-- v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^
-- MULTIPLY(h, 0x09);
-- v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^
-- MULTIPLY(h, 0x0d);
-- v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^
-- MULTIPLY(h, 0x0b);
-- v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^
-- MULTIPLY(h, 0x0e);
-+ v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-+ SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-+ v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-+ SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-+ v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-+ SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-+ v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-+ SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
- }
--#undef XT
--#undef MULTIPLY
-
- return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
- #endif
-@@ -9657,7 +8643,7 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
-
- // Perform the last round of an AES encryption flow on data (state) in a using
- // the round key in RoundKey, and store the result in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
- FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
- {
- #if defined(__aarch64__)
-@@ -9673,59 +8659,166 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
- w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
- // sub bytes
-- v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
-- // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
-- v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
-+ v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-- // add round key
-+ // add round key
- return vreinterpretq_m128i_u8(v) ^ RoundKey;
-
- #else /* ARMv7-A implementation */
- uint8_t v[16] = {
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
-- SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
-+ _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
- };
-
- return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
- #endif
- }
-
-+// Perform the last round of an AES decryption flow on data (state) in a using
-+// the round key in RoundKey, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-+{
-+#if defined(__aarch64__)
-+ static const uint8_t inv_shift_rows[] = {
-+ 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-+ 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-+ };
-+
-+ uint8x16_t v;
-+ uint8x16_t w = vreinterpretq_u8_m128i(a);
-+
-+ // inverse shift rows
-+ w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-+
-+ // inverse sub bytes
-+ v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-+
-+ // add round key
-+ return vreinterpretq_m128i_u8(v) ^ RoundKey;
-+
-+#else /* ARMv7-A NEON implementation */
-+ /* FIXME: optimized for NEON */
-+ uint8_t v[4][4];
-+ uint8_t *_a = (uint8_t *) &a;
-+ for (int i = 0; i < 16; ++i) {
-+ v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
-+ }
-+
-+ return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
-+#endif
-+}
-+
-+// Perform the InvMixColumns transformation on a and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-+{
-+#if defined(__aarch64__)
-+ static const uint8_t ror32by8[] = {
-+ 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-+ 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-+ };
-+ uint8x16_t v = vreinterpretq_u8_m128i(a);
-+ uint8x16_t w;
-+
-+ // multiplying 'v' by 4 in GF(2^8)
-+ w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-+ w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-+ v ^= w;
-+ v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-+
-+ // multiplying 'v' by 2 in GF(2^8)
-+ w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-+ w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-+ w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-+ return vreinterpretq_m128i_u8(w);
-+
-+#else /* ARMv7-A NEON implementation */
-+ uint8_t i, e, f, g, h, v[4][4];
-+ vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
-+ for (i = 0; i < 4; ++i) {
-+ e = v[i][0];
-+ f = v[i][1];
-+ g = v[i][2];
-+ h = v[i][3];
-+
-+ v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-+ SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-+ v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-+ SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-+ v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-+ SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-+ v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-+ SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
-+ }
-+
-+ return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
-+#endif
-+}
-+
-+// Assist in expanding the AES cipher key by computing steps towards generating
-+// a round key for encryption cipher using data from a and an 8-bit round
-+// constant specified in imm8, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
-+//
- // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
- // This instruction generates a round key for AES encryption. See
- // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
- // for details.
--//
--// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
--FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
-+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
- {
-- uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
-- uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
-+#if defined(__aarch64__)
-+ uint8x16_t _a = vreinterpretq_u8_m128i(a);
-+ uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
-+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
-+
-+ uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
-+ uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
-+ uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
-+
-+ return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
-+
-+#else /* ARMv7-A NEON implementation */
-+ uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
-+ uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
- for (int i = 0; i < 4; ++i) {
-- ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
-- ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
-+ ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
-+ ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
- }
- return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
- ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
-+#endif
- }
- #undef SSE2NEON_AES_SBOX
- #undef SSE2NEON_AES_RSBOX
-
-+#if defined(__aarch64__)
-+#undef SSE2NEON_XT
-+#undef SSE2NEON_MULTIPLY
-+#endif
-+
- #else /* __ARM_FEATURE_CRYPTO */
- // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
- // AESMC and then manually applying the real key as an xor operation. This
-@@ -9750,7 +8843,9 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
- vreinterpretq_u8_m128i(RoundKey)));
- }
-
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
-+// Perform the last round of an AES encryption flow on data (state) in a using
-+// the round key in RoundKey, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
- FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
- {
- return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
-@@ -9758,6 +8853,23 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
- RoundKey);
- }
-
-+// Perform the last round of an AES decryption flow on data (state) in a using
-+// the round key in RoundKey, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-+{
-+ return vreinterpretq_m128i_u8(
-+ vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^
-+ vreinterpretq_u8_m128i(RoundKey));
-+}
-+
-+// Perform the InvMixColumns transformation on a and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-+{
-+ return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
-+}
-+
- // Assist in expanding the AES cipher key by computing steps towards generating
- // a round key for encryption cipher using data from a and an 8-bit round
- // constant specified in imm8, and store the result in dst."
-@@ -9783,7 +8895,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
-
- // Perform a carry-less multiplication of two 64-bit integers, selected from a
- // and b according to imm8, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
- FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
- {
- uint64x2_t a = vreinterpretq_u64_m128i(_a);
-@@ -9828,7 +8940,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
-
- // Count the number of bits set to 1 in unsigned 32-bit integer a, and
- // return that count in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
- FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
- {
- #if defined(__aarch64__)
-@@ -9855,7 +8967,7 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
-
- // Count the number of bits set to 1 in unsigned 64-bit integer a, and
- // return that count in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
- FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
- {
- #if defined(__aarch64__)
-@@ -9911,7 +9023,6 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
-
- // Return the current 64-bit value of the processor's time-stamp counter.
- // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
--
- FORCE_INLINE uint64_t _rdtsc(void)
- {
- #if defined(__aarch64__)
diff --git a/bazel/patches/emp-tool.patch b/bazel/patches/emp-tool.patch
deleted file mode 100644
index ac40006..0000000
--- a/bazel/patches/emp-tool.patch
+++ /dev/null
@@ -1,175 +0,0 @@
-diff --git a/emp-tool/utils/aes.h b/emp-tool/utils/aes.h
-index 0235544..75a8486 100644
---- a/emp-tool/utils/aes.h
-+++ b/emp-tool/utils/aes.h
-@@ -54,6 +54,10 @@
-
- #include "emp-tool/utils/block.h"
-
-+#ifdef __aarch64__
-+#include "emp-tool/utils/sse2neon.h"
-+#endif
-+
- namespace emp {
-
- typedef struct { block rd_key[11]; unsigned int rounds; } AES_KEY;
-@@ -103,6 +107,7 @@ AES_set_encrypt_key(const block userkey, AES_KEY *key) {
-
- #ifdef __x86_64__
- __attribute__((target("aes,sse2")))
-+#endif
- inline void AES_ecb_encrypt_blks(block *blks, unsigned int nblks, const AES_KEY *key) {
- for (unsigned int i = 0; i < nblks; ++i)
- blks[i] = _mm_xor_si128(blks[i], key->rd_key[0]);
-@@ -112,22 +117,6 @@ inline void AES_ecb_encrypt_blks(block *blks, unsigned int nblks, const AES_KEY
- for (unsigned int i = 0; i < nblks; ++i)
- blks[i] = _mm_aesenclast_si128(blks[i], key->rd_key[key->rounds]);
- }
--#elif __aarch64__
--inline void AES_ecb_encrypt_blks(block *_blks, unsigned int nblks, const AES_KEY *key) {
-- uint8x16_t * blks = (uint8x16_t*)(_blks);
-- uint8x16_t * keys = (uint8x16_t*)(key->rd_key);
-- auto * first = blks;
-- for (unsigned int j = 0; j < key->rounds-1; ++j) {
-- uint8x16_t key_j = (uint8x16_t)keys[j];
-- blks = first;
-- for (unsigned int i = 0; i < nblks; ++i, ++blks)
-- *blks = vaesmcq_u8(vaeseq_u8(*blks, key_j));
-- }
-- uint8x16_t last_key = (uint8x16_t)keys[key->rounds-1];
-- for (unsigned int i = 0; i < nblks; ++i, ++first)
-- *first = vaeseq_u8(*first, last_key) ^ (uint8x16_t)keys[key->rounds];
--}
--#endif
-
- #ifdef __GNUC__
- #ifndef __clang__
-diff --git a/emp-tool/utils/aes_opt.h b/emp-tool/utils/aes_opt.h
-index 2594e32..6a78b75 100644
---- a/emp-tool/utils/aes_opt.h
-+++ b/emp-tool/utils/aes_opt.h
-@@ -58,7 +58,6 @@ static inline void AES_opt_key_schedule(block* user_key, AES_KEY *keys) {
- /*
- * With numKeys keys, use each key to encrypt numEncs blocks.
- */
--#ifdef __x86_64__
- template
- static inline void ParaEnc(block *blks, AES_KEY *keys) {
- block * first = blks;
-@@ -90,29 +89,6 @@ static inline void ParaEnc(block *blks, AES_KEY *keys) {
- }
- }
- }
--#elif __aarch64__
--template
--static inline void ParaEnc(block *_blks, AES_KEY *keys) {
-- uint8x16_t * first = (uint8x16_t*)(_blks);
--
-- for (unsigned int r = 0; r < 9; ++r) {
-- auto blks = first;
-- for(size_t i = 0; i < numKeys; ++i) {
-- uint8x16_t K = vreinterpretq_u8_m128i(keys[i].rd_key[r]);
-- for(size_t j = 0; j < numEncs; ++j, ++blks)
-- *blks = vaesmcq_u8(vaeseq_u8(*blks, K));
-- }
-- }
--
-- auto blks = first;
-- for(size_t i = 0; i < numKeys; ++i) {
-- uint8x16_t K = vreinterpretq_u8_m128i(keys[i].rd_key[9]);
-- uint8x16_t K2 = vreinterpretq_u8_m128i(keys[i].rd_key[10]);
-- for(size_t j = 0; j < numEncs; ++j, ++blks)
-- *blks = vaeseq_u8(*blks, K) ^ K2;
-- }
--}
--#endif
-
- }
- #endif
-diff --git a/emp-tool/utils/block.h b/emp-tool/utils/block.h
-index f7d3d34..fcc21c1 100644
---- a/emp-tool/utils/block.h
-+++ b/emp-tool/utils/block.h
-@@ -5,16 +5,7 @@
- #include
- #elif __aarch64__
- #include "sse2neon.h"
--inline __m128i _mm_aesimc_si128(__m128i a) {
-- return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
--}
--
--inline __m128i _mm_aesdeclast_si128 (__m128i a, __m128i RoundKey)
--{
-- return vreinterpretq_m128i_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^ vreinterpretq_u8_m128i(RoundKey));
--}
- #endif
--
- #include
- #include
- #include
-diff --git a/emp-tool/utils/f2k.h b/emp-tool/utils/f2k.h
-index 7fe1b1b..f6186a1 100644
---- a/emp-tool/utils/f2k.h
-+++ b/emp-tool/utils/f2k.h
-@@ -6,6 +6,7 @@ namespace emp {
- /* multiplication in galois field without reduction */
- #ifdef __x86_64__
- __attribute__((target("sse2,pclmul")))
-+ #endif
- inline void mul128(__m128i a, __m128i b, __m128i *res1, __m128i *res2) {
- __m128i tmp3, tmp4, tmp5, tmp6;
- tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
-@@ -22,28 +23,6 @@ namespace emp {
- *res1 = tmp3;
- *res2 = tmp6;
- }
-- #elif __aarch64__
-- inline void mul128(__m128i a, __m128i b, __m128i *res1, __m128i *res2) {
-- __m128i tmp3, tmp4, tmp5, tmp6;
-- poly64_t a_lo = (poly64_t)vget_low_u64(vreinterpretq_u64_m128i(a));
-- poly64_t a_hi = (poly64_t)vget_high_u64(vreinterpretq_u64_m128i(a));
-- poly64_t b_lo = (poly64_t)vget_low_u64(vreinterpretq_u64_m128i(b));
-- poly64_t b_hi = (poly64_t)vget_high_u64(vreinterpretq_u64_m128i(b));
-- tmp3 = (__m128i)vmull_p64(a_lo, b_lo);
-- tmp4 = (__m128i)vmull_p64(a_hi, b_lo);
-- tmp5 = (__m128i)vmull_p64(a_lo, b_hi);
-- tmp6 = (__m128i)vmull_p64(a_hi, b_hi);
--
-- tmp4 = _mm_xor_si128(tmp4, tmp5);
-- tmp5 = _mm_slli_si128(tmp4, 8);
-- tmp4 = _mm_srli_si128(tmp4, 8);
-- tmp3 = _mm_xor_si128(tmp3, tmp5);
-- tmp6 = _mm_xor_si128(tmp6, tmp4);
-- // initial mul now in tmp3, tmp6
-- *res1 = tmp3;
-- *res2 = tmp6;
-- }
-- #endif
-
- /* multiplication in galois field with reduction */
- #ifdef __x86_64__
-diff --git a/emp-tool/utils/prg.h b/emp-tool/utils/prg.h
-index 23bbf42..5101d7e 100644
---- a/emp-tool/utils/prg.h
-+++ b/emp-tool/utils/prg.h
-@@ -82,7 +82,7 @@ class PRG { public:
- } else {
- block tmp[2];
- random_block(tmp, 2);
-- memcpy(data, tmp, nbytes);
-+ memcpy(data, tmp, nbytes <= 32? nbytes : 32);
- }
- }
-
-diff --git a/emp-tool/utils/block.h b/emp-tool/utils/block.h
-index f7d3d34..3c25a73 100644
---- a/emp-tool/utils/block.h
-+++ b/emp-tool/utils/block.h
-@@ -19,6 +19,7 @@ inline __m128i _mm_aesdeclast_si128 (__m128i a, __m128i RoundKey)
- #include
- #include
- #include
-+#include
-
- namespace emp {
-
diff --git a/bazel/patches/flatbuffers.patch b/bazel/patches/flatbuffers.patch
deleted file mode 100644
index 6f9c4ee..0000000
--- a/bazel/patches/flatbuffers.patch
+++ /dev/null
@@ -1,306 +0,0 @@
-diff --git a/grpc/BUILD.bazel b/grpc/BUILD.bazel
-deleted file mode 100644
-index e69de29b..00000000
-diff --git a/grpc/src/compiler/BUILD.bazel b/grpc/src/compiler/BUILD.bazel
-deleted file mode 100644
-index 0efa9560..00000000
---- a/grpc/src/compiler/BUILD.bazel
-+++ /dev/null
-@@ -1,131 +0,0 @@
--load("@rules_cc//cc:defs.bzl", "cc_library")
--
--package(
-- default_visibility = ["//visibility:public"],
--)
--
--filegroup(
-- name = "distribution",
-- srcs = [
-- "BUILD.bazel",
-- ] + glob([
-- "*.cc",
-- "*.h",
-- ]),
--)
--
--filegroup(
-- name = "common_headers",
-- srcs = [
-- "schema_interface.h",
-- ],
--)
--
--cc_library(
-- name = "cpp_generator",
-- srcs = [
-- "cpp_generator.cc",
-- ],
-- hdrs = [
-- "cpp_generator.h",
-- ":common_headers",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- deps = [
-- "//:flatbuffers",
-- ],
--)
--
--cc_library(
-- name = "go_generator",
-- srcs = [
-- "go_generator.cc",
-- ],
-- hdrs = [
-- "go_generator.h",
-- ":common_headers",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- deps = [
-- "//:flatbuffers",
-- ],
--)
--
--cc_library(
-- name = "java_generator",
-- srcs = [
-- "java_generator.cc",
-- ],
-- hdrs = [
-- "java_generator.h",
-- ":common_headers",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- deps = [
-- "//:flatbuffers",
-- ],
--)
--
--cc_library(
-- name = "python_generator",
-- hdrs = [
-- "python_generator.h",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- deps = [
-- ":python_generator_private",
-- ],
--)
--
--cc_library(
-- name = "python_generator_private",
-- srcs = [
-- "python_generator.cc",
-- ],
-- hdrs = [
-- "python_generator.h",
-- ":common_headers",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- visibility = ["//visibility:private"],
-- deps = [
-- "//:flatbuffers",
-- ],
--)
--
--cc_library(
-- name = "swift_generator",
-- srcs = [
-- "swift_generator.cc",
-- ],
-- hdrs = [
-- "swift_generator.h",
-- ":common_headers",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- deps = [
-- "//:flatbuffers",
-- ],
--)
--
--cc_library(
-- name = "ts_generator",
-- srcs = [
-- "ts_generator.cc",
-- ],
-- hdrs = [
-- "ts_generator.h",
-- ":common_headers",
-- ],
-- include_prefix = "src/compiler",
-- strip_include_prefix = "/grpc/src/compiler",
-- deps = [
-- "//:flatbuffers",
-- ],
--)
-diff --git a/src/BUILD.bazel b/src/BUILD.bazel
-deleted file mode 100644
-index 679b10f7..00000000
---- a/src/BUILD.bazel
-+++ /dev/null
-@@ -1,159 +0,0 @@
--# @unused
--load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
--
--package(
-- default_visibility = ["//visibility:private"],
--)
--
--filegroup(
-- name = "distribution",
-- srcs = [
-- "BUILD.bazel",
-- ] + glob([
-- "*.cpp",
-- "*.h",
-- ]),
-- visibility = ["//visibility:public"],
--)
--
--cc_library(
-- name = "code_generators",
-- srcs = ["code_generators.cpp"],
-- hdrs = [
-- "//:public_headers",
-- ],
-- strip_include_prefix = "/include",
-- visibility = ["//:__subpackages__"],
--)
--
--cc_library(
-- name = "generate_fbs",
-- srcs = ["idl_gen_fbs.cpp"],
-- hdrs = ["idl_gen_fbs.h"],
-- strip_include_prefix = "/src",
-- visibility = ["//:__subpackages__"],
-- deps = [":code_generators"],
--)
--
--# Public flatc library to compile flatbuffer files at runtime.
--cc_library(
-- name = "flatbuffers",
-- srcs = [
-- "idl_gen_text.cpp",
-- "idl_gen_text.h",
-- "idl_parser.cpp",
-- "reflection.cpp",
-- "util.cpp",
-- ],
-- hdrs = [
-- "//:public_headers",
-- ],
-- linkopts = select({
-- # TODO: Bazel uses `clang` instead of `clang++` to link
-- # C++ code on BSD. Temporarily adding these linker flags while
-- # we wait for Bazel to resolve
-- # https://github.com/bazelbuild/bazel/issues/12023.
-- "//:platform_freebsd": ["-lm"],
-- "//:platform_openbsd": ["-lm"],
-- "//conditions:default": [],
-- }),
-- strip_include_prefix = "/include",
-- visibility = ["//:__subpackages__"],
-- deps = [
-- ":code_generators",
-- ":generate_fbs",
-- ],
--)
--
--# Public flatc compiler library.
--cc_library(
-- name = "flatc_library",
-- srcs = [
-- "annotated_binary_text_gen.cpp",
-- "annotated_binary_text_gen.h",
-- "bfbs_gen.h",
-- "bfbs_gen_lua.cpp",
-- "bfbs_gen_lua.h",
-- "bfbs_gen_nim.cpp",
-- "bfbs_gen_nim.h",
-- "bfbs_namer.h",
-- "binary_annotator.cpp",
-- "binary_annotator.h",
-- "flatc.cpp",
-- "namer.h",
-- ],
-- hdrs = [
-- "//:flatc_headers",
-- ],
-- strip_include_prefix = "/include",
-- visibility = ["//:__pkg__"],
-- deps = [
-- ":flatbuffers",
-- ],
--)
--
--# Public flatc compiler.
--cc_library(
-- name = "flatc",
-- srcs = [
-- "bfbs_gen.h",
-- "bfbs_gen_lua.cpp",
-- "bfbs_gen_lua.h",
-- "bfbs_gen_nim.cpp",
-- "bfbs_gen_nim.h",
-- "bfbs_namer.h",
-- "file_binary_writer.cpp",
-- "file_name_saving_file_manager.cpp",
-- "file_writer.cpp",
-- "flatc_main.cpp",
-- "idl_gen_binary.cpp",
-- "idl_gen_binary.h",
-- "idl_gen_cpp.cpp",
-- "idl_gen_cpp.h",
-- "idl_gen_csharp.cpp",
-- "idl_gen_csharp.h",
-- "idl_gen_dart.cpp",
-- "idl_gen_dart.h",
-- "idl_gen_go.cpp",
-- "idl_gen_go.h",
-- "idl_gen_grpc.cpp",
-- "idl_gen_java.cpp",
-- "idl_gen_java.h",
-- "idl_gen_json_schema.cpp",
-- "idl_gen_json_schema.h",
-- "idl_gen_kotlin.cpp",
-- "idl_gen_kotlin.h",
-- "idl_gen_kotlin_kmp.cpp",
-- "idl_gen_lobster.cpp",
-- "idl_gen_lobster.h",
-- "idl_gen_php.cpp",
-- "idl_gen_php.h",
-- "idl_gen_python.cpp",
-- "idl_gen_python.h",
-- "idl_gen_rust.cpp",
-- "idl_gen_rust.h",
-- "idl_gen_swift.cpp",
-- "idl_gen_swift.h",
-- "idl_gen_text.cpp",
-- "idl_gen_text.h",
-- "idl_gen_ts.cpp",
-- "idl_gen_ts.h",
-- "idl_namer.h",
-- "namer.h",
-- "util.cpp",
-- ],
-- hdrs = [
-- "//:flatc_headers",
-- ],
-- strip_include_prefix = "/include",
-- visibility = ["//:__pkg__"],
-- deps = [
-- ":flatc_library",
-- "//grpc/src/compiler:cpp_generator",
-- "//grpc/src/compiler:go_generator",
-- "//grpc/src/compiler:java_generator",
-- "//grpc/src/compiler:python_generator",
-- "//grpc/src/compiler:swift_generator",
-- "//grpc/src/compiler:ts_generator",
-- ],
--)
---
diff --git a/bazel/patches/grpc-1.66.patch b/bazel/patches/grpc-1.66.patch
new file mode 100644
index 0000000..b6f82e5
--- /dev/null
+++ b/bazel/patches/grpc-1.66.patch
@@ -0,0 +1,20 @@
+diff --git a/third_party/BUILD b/third_party/BUILD
+index 77cb52d0fc..c4b647f5c9 100644
+--- a/third_party/BUILD
++++ b/third_party/BUILD
+@@ -18,13 +18,13 @@ package(default_visibility = ["//:__subpackages__"])
+
+ alias(
+ name = "libssl",
+- actual = "@boringssl//:ssl",
++ actual = "@openssl//:ssl",
+ tags = ["manual"],
+ )
+
+ alias(
+ name = "libcrypto",
+- actual = "@boringssl//:crypto",
++ actual = "@openssl//:crypto",
+ tags = ["manual"],
+ )
+
diff --git a/bazel/patches/grpc-module-file.patch b/bazel/patches/grpc-module-file.patch
new file mode 100644
index 0000000..29dc393
--- /dev/null
+++ b/bazel/patches/grpc-module-file.patch
@@ -0,0 +1,13 @@
+diff --git a/MODULE.bazel b/MODULE.bazel
+index 4a8fbe83..8650f678 100644
+--- a/MODULE.bazel
++++ b/MODULE.bazel
+@@ -8,7 +8,7 @@ module(
+ bazel_dep(name = "abseil-cpp", version = "20240116.0", repo_name = "com_google_absl")
+ bazel_dep(name = "apple_support", version = "1.15.1", repo_name = "build_bazel_apple_support")
+ bazel_dep(name = "bazel_skylib", version = "1.5.0")
+-bazel_dep(name = "boringssl", version = "0.0.0-20230215-5c22014")
++bazel_dep(name = "openssl", version = "3.3.2")
+ bazel_dep(name = "c-ares", version = "1.15.0", repo_name = "com_github_cares_cares")
+ bazel_dep(name = "gazelle", version = "0.36.0", repo_name = "bazel_gazelle")
+ bazel_dep(name = "google_benchmark", version = "1.8.4", repo_name = "com_github_google_benchmark")
\ No newline at end of file
diff --git a/bazel/patches/grpc.patch b/bazel/patches/grpc.patch
deleted file mode 100644
index fd8e09f..0000000
--- a/bazel/patches/grpc.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-diff --git a/bazel/grpc_deps.bzl b/bazel/grpc_deps.bzl
-index 5e65a65df4..03bbd2361e 100644
---- a/bazel/grpc_deps.bzl
-+++ b/bazel/grpc_deps.bzl
-@@ -57,12 +57,12 @@ def grpc_deps():
-
- native.bind(
- name = "libssl",
-- actual = "@boringssl//:ssl",
-+ actual = "@com_github_openssl_openssl//:openssl",
- )
-
- native.bind(
- name = "libcrypto",
-- actual = "@boringssl//:crypto",
-+ actual = "@com_github_openssl_openssl//:openssl",
- )
-
- native.bind(
-diff --git a/src/core/lib/iomgr/tcp_posix.cc b/src/core/lib/iomgr/tcp_posix.cc
-index 72e1b6609e..aded52d0db 100644
---- a/src/core/lib/iomgr/tcp_posix.cc
-+++ b/src/core/lib/iomgr/tcp_posix.cc
-@@ -41,6 +41,8 @@
- #include
- #include
-
-+#include "absl/strings/str_cat.h"
-+
- #include
- #include
- #include
diff --git a/bazel/patches/ippcp.patch b/bazel/patches/ippcp.patch
deleted file mode 100644
index 0af05b2..0000000
--- a/bazel/patches/ippcp.patch
+++ /dev/null
@@ -1,250 +0,0 @@
-diff --git a/sources/cmake/linux/GNU8.2.0.cmake b/sources/cmake/linux/GNU8.2.0.cmake
-index 24d7e0f..15dd433 100644
---- a/sources/cmake/linux/GNU8.2.0.cmake
-+++ b/sources/cmake/linux/GNU8.2.0.cmake
-@@ -32,7 +32,7 @@ set(LINK_FLAG_DYNAMIC_LINUX "${LINK_FLAG_SECURITY} -nostdlib")
- # Dynamically link lib c (libdl is for old apps)
- set(LINK_FLAG_DYNAMIC_LINUX "${LINK_FLAG_DYNAMIC_LINUX} -Wl,-call_shared,-lc")
- # Create a shared library
--set(LINK_FLAG_DYNAMIC_LINUX "-Wl,-shared")
-+set(LINK_FLAG_DYNAMIC_LINUX "-Wl,-shared,-fuse-ld=bfd")
- if(${ARCH} MATCHES "ia32")
- # Tells the compiler to generate code for a specific architecture (32)
- set(LINK_FLAG_DYNAMIC_LINUX "${LINK_FLAG_DYNAMIC_LINUX} -m32")
-@@ -74,7 +74,7 @@ if ((${ARCH} MATCHES "ia32") OR (NOT NONPIC_LIB))
- endif()
-
- # Security flag that adds compile-time and run-time checks
--set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_FORTIFY_SOURCE=2")
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2")
-
- if(NOT NONPIC_LIB)
- # Position Independent Execution (PIE)
-@@ -95,6 +95,8 @@ if(${ARCH} MATCHES "ia32")
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m32")
- endif(${ARCH} MATCHES "ia32")
-
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unknown-pragmas -Wno-missing-braces -Wno-comment -Wno-strict-aliasing -Wno-parentheses -Wno-array-parameter")
-+
- # Optimization level = 3, no-debug definition (turns off asserts), warnings=errors
- set (CMAKE_C_FLAGS_RELEASE " -O3 -DNDEBUG -Werror")
-
-diff --git a/sources/cmake/macosx/AppleClang11.0.0.cmake b/sources/cmake/macosx/AppleClang11.0.0.cmake
-index 5b92877..ccb963e 100644
---- a/sources/cmake/macosx/AppleClang11.0.0.cmake
-+++ b/sources/cmake/macosx/AppleClang11.0.0.cmake
-@@ -20,12 +20,6 @@
-
- # Security Linker flags
- set(LINK_FLAG_SECURITY "")
--# Disallows undefined symbols in object files. Undefined symbols in shared libraries are still allowed
--set(LINK_FLAG_SECURITY "${LINK_FLAG_SECURITY} -Wl,-z,defs")
--# Stack execution protection
--set(LINK_FLAG_SECURITY "${LINK_FLAG_SECURITY} -Wl,-z,noexecstack")
--# Data relocation and protection (RELRO)
--set(LINK_FLAG_SECURITY "${LINK_FLAG_SECURITY} -Wl,-z,relro -Wl,-z,now")
- # Prevents the compiler from using standard libraries and startup files when linking.
- set(LINK_FLAG_DYNAMIC_MACOSX "${LINK_FLAG_SECURITY} -nostdlib")
- # Dynamically link lib c (libdl is for old apps)
-@@ -79,7 +73,7 @@ if ((${ARCH} MATCHES "ia32") OR (NOT NONPIC_LIB))
- endif()
-
- # Security flag that adds compile-time and run-time checks
--set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_FORTIFY_SOURCE=2")
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2")
-
- if(NOT NONPIC_LIB)
- # Position Independent Execution (PIE)
-@@ -98,6 +92,8 @@ if(${ARCH} MATCHES "ia32")
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m32")
- endif(${ARCH} MATCHES "ia32")
-
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-missing-braces -Wno-unused-command-line-argument -Wno-unused-but-set-variable -Wno-unknown-warning-option")
-+
- # Optimization level = 3, no-debug definition (turns off asserts), warnings=errors
- set (CMAKE_C_FLAGS_RELEASE " -O3 -DNDEBUG -Werror")
-
-@@ -115,3 +111,5 @@ set(l9_opt "${l9_opt} -march=haswell -mavx2 -maes -mpclmul -msha -mrdrnd -mrdsee
- set(n0_opt "${n0_opt} -march=knl -mavx2 -maes -mavx512f -mavx512cd -mavx512pf -mavx512er -mpclmul -msha -mrdrnd -mrdseed")
- set(k0_opt "${k0_opt} -march=skylake-avx512")
- set(k0_opt "${k0_opt} -maes -mavx512f -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mpclmul -msha -mrdrnd -mrdseed -madx -mgfni -mvaes -mvpclmulqdq -mavx512vbmi -mavx512vbmi2")
-+set(k1_opt "${k1_opt} -march=skylake-avx512")
-+set(k1_opt "${k1_opt} -maes -mavx512f -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mpclmul -msha -mrdrnd -mrdseed -madx -mgfni -mvaes -mvpclmulqdq -mavx512vbmi -mavx512vbmi2")
-diff --git a/sources/cmake/macosx/common.cmake b/sources/cmake/macosx/common.cmake
-index 85ec3ad..67bb9f9 100644
---- a/sources/cmake/macosx/common.cmake
-+++ b/sources/cmake/macosx/common.cmake
-@@ -18,7 +18,7 @@
- # Intel® Integrated Performance Primitives Cryptography (Intel® IPP Cryptography)
- #
-
--set(OS_DEFAULT_COMPILER Intel19.0.0)
-+set(OS_DEFAULT_COMPILER AppleClang11.0.0)
-
- set(LIBRARY_DEFINES "${LIBRARY_DEFINES} -DIPP_PIC -DOSXEM64T -DLINUX32E -D_ARCH_EM64T")
- #set(LIBRARY_DEFINES "${LIBRARY_DEFINES} -DBN_OPENSSL_DISABLE")
-\ No newline at end of file
-diff --git a/sources/ippcp/crypto_mb/src/cmake/linux/GNU.cmake b/sources/ippcp/crypto_mb/src/cmake/linux/GNU.cmake
-index a2abeeb..67aca8b 100644
---- a/sources/ippcp/crypto_mb/src/cmake/linux/GNU.cmake
-+++ b/sources/ippcp/crypto_mb/src/cmake/linux/GNU.cmake
-@@ -31,7 +31,7 @@ set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -Wformat -Wformat-security
- if(${CMAKE_BUILD_TYPE} STREQUAL "Release")
- if(NOT DEFINED NO_FORTIFY_SOURCE)
- # Security flag that adds compile-time and run-time checks.
-- set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -D_FORTIFY_SOURCE=2")
-+ set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2")
- endif()
- endif()
-
-@@ -51,7 +51,7 @@ set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -Werror")
- # Linker flags
-
- # Create shared library
--set(LINK_FLAGS_DYNAMIC " -Wl,-shared")
-+set(LINK_FLAGS_DYNAMIC " -Wl,-shared,-fuse-ld=bfd")
- # Add export files
- set(LINK_FLAGS_DYNAMIC "${LINK_FLAGS_DYNAMIC} ${CRYPTO_MB_SOURCES_DIR}/cmake/dll_export/crypto_mb.linux.lib-export")
-
-@@ -69,6 +69,7 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
-
- # Suppress warnings from casts from a pointer to an integer type of a different size
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast")
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unknown-pragmas -Wno-missing-braces -Wno-comment -Wno-strict-aliasing -Wno-parentheses")
-
- # Optimization level = 3, no-debug definition (turns off asserts)
- set(CMAKE_C_FLAGS_RELEASE " -O3 -DNDEBUG")
-diff --git a/sources/ippcp/crypto_mb/src/cmake/macosx/AppleClang.cmake b/sources/ippcp/crypto_mb/src/cmake/macosx/AppleClang.cmake
-index ea1641d..f98fc2d 100644
---- a/sources/ippcp/crypto_mb/src/cmake/macosx/AppleClang.cmake
-+++ b/sources/ippcp/crypto_mb/src/cmake/macosx/AppleClang.cmake
-@@ -17,10 +17,6 @@
- # Security Linker flags
-
- set(LINK_FLAG_SECURITY "")
--# Data relocation and protection (RELRO)
--set(LINK_FLAG_SECURITY "${LINK_FLAG_SECURITY} -Wl,-z,relro -Wl,-z,now")
--# Stack execution protection
--set(LINK_FLAG_SECURITY "${LINK_FLAG_SECURITY} -Wl,-z,noexecstack")
-
- # Security Compiler flags
-
-@@ -30,7 +26,7 @@ set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -Wformat -Wformat-security
-
- if(${CMAKE_BUILD_TYPE} STREQUAL "Release")
- # Security flag that adds compile-time and run-time checks.
-- set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -D_FORTIFY_SOURCE=2")
-+ set(CMAKE_C_FLAGS_SECURITY "${CMAKE_C_FLAGS_SECURITY} -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2")
- endif()
-
- # Stack-based Buffer Overrun Detection
-@@ -65,6 +61,8 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
- # Suppress warnings from casts from a pointer to an integer type of a different size
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast")
-
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-missing-braces -Wno-unknown-warning-option")
-+
- # Optimization level = 3, no-debug definition (turns off asserts)
- set(CMAKE_C_FLAGS_RELEASE " -O3 -DNDEBUG")
- set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
-
-diff --git a/sources/cmake/linux/Clang9.0.0.cmake b/sources/cmake/linux/Clang9.0.0.cmake
-index 0015431..f93411c 100644
---- a/sources/cmake/linux/Clang9.0.0.cmake
-+++ b/sources/cmake/linux/Clang9.0.0.cmake
-@@ -79,7 +79,7 @@ endif()
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fcf-protection=full")
-
- # Security flag that adds compile-time and run-time checks
--set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_FORTIFY_SOURCE=2")
-+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-
- if(NOT NONPIC_LIB)
- # Position Independent Execution (PIE)
-@@ -107,7 +107,7 @@ if(SANITIZERS)
- endif(SANITIZERS)
-
- # Optimization level = 3, no-debug definition (turns off asserts), warnings=errors
--set (CMAKE_C_FLAGS_RELEASE " -O3 -DNDEBUG -Werror")
-+set (CMAKE_C_FLAGS_RELEASE " -O3 -DNDEBUG -Werror -Wno-unused-function -Wno-missing-braces -Wno-unused-but-set-variable -Wno-unknown-pragmas")
-
- # DEBUG flags - optimization level = 0, generation GDB information (-g)
- set (CMAKE_C_FLAGS_DEBUG " -O0 -g")
-
-diff --git a/sources/include/dispatcher.h b/sources/include/dispatcher.h
-index 8290df6..a2f93d7 100644
---- a/sources/include/dispatcher.h
-+++ b/sources/include/dispatcher.h
-@@ -92,9 +92,13 @@ extern "C" {
- #define LIB_W7 LIB_S8
- #elif defined( _ARCH_EM64T ) && !defined( OSXEM64T ) && !defined( WIN32E ) /* Linux* OS Intel64 supports N0 */
- enum lib_enum {
-- LIB_M7=0, LIB_N8=1, LIB_Y8=2, LIB_E9=3, LIB_L9=4, LIB_N0=5, LIB_K0=6, LIB_K1=7,LIB_NOMORE
-+ LIB_E9=0, LIB_L9=1, LIB_K0=2, LIB_K1=3,LIB_NOMORE
- };
-- #define LIB_PX LIB_M7
-+ #define LIB_PX LIB_E9
-+ #define LIB_M7 LIB_E9
-+ #define LIB_N8 LIB_E9
-+ #define LIB_Y8 LIB_E9
-+ #define LIB_N0 LIB_L9
- #elif defined( _ARCH_EM64T ) && !defined( OSXEM64T ) /* Windows* OS Intel64 doesn't support N0 */
- enum lib_enum {
- LIB_M7=0, LIB_N8=1, LIB_Y8=2, LIB_E9=3, LIB_L9=4, LIB_K0=5, LIB_K1=6, LIB_NOMORE
-@@ -103,11 +107,12 @@ extern "C" {
- #define LIB_N0 LIB_L9
- #elif defined( OSXEM64T )
- enum lib_enum {
-- LIB_Y8=0, LIB_E9=1, LIB_L9=2, LIB_K0=3, LIB_K1=4, LIB_NOMORE
-+ LIB_E9=0, LIB_L9=1, LIB_K0=2, LIB_K1=3, LIB_NOMORE
- };
-- #define LIB_PX LIB_Y8
-- #define LIB_M7 LIB_Y8
-- #define LIB_N8 LIB_Y8
-+ #define LIB_PX LIB_E9
-+ #define LIB_M7 LIB_E9
-+ #define LIB_N8 LIB_E9
-+ #define LIB_Y8 LIB_E9
- #define LIB_N0 LIB_L9
- #elif defined( _ARCH_LRB2 )
- enum lib_enum {
-diff --git a/sources/include/owndefs.h b/sources/include/owndefs.h
-index dcc1ede..7c1e93e 100644
---- a/sources/include/owndefs.h
-+++ b/sources/include/owndefs.h
-@@ -632,14 +632,14 @@ extern double __intel_castu64_f64(unsigned __int64 val);
-
- #elif defined(linux)
- /* LIN-32, LIN-64 */
-- #if ( defined(_W7) || defined(_M7) )
-+ #if ( defined(_W7) || defined(_E9) )
- #define _IPP_DATA 1
- #endif
-
-
- /* OSX-32, OSX-64 */
- #elif defined(OSX32) || defined(OSXEM64T)
-- #if ( defined(_Y8) )
-+ #if ( defined(_E9) )
- #define _IPP_DATA 1
- #endif
- #endif
-diff --git a/sources/ippcp/CMakeLists.txt b/sources/ippcp/CMakeLists.txt
-index 315d1a3..8b11c7a 100644
---- a/sources/ippcp/CMakeLists.txt
-+++ b/sources/ippcp/CMakeLists.txt
-@@ -40,12 +40,12 @@ if(WIN32)
- endif(WIN32)
- if(UNIX)
- if(APPLE)
-- set(BASE_PLATFORM_LIST ${BASE_PLATFORM_LIST} y8 e9 l9 k0 k1)
-+ set(BASE_PLATFORM_LIST ${BASE_PLATFORM_LIST} e9 l9 k0 k1)
- else()
- if (${ARCH} MATCHES "ia32")
- set(BASE_PLATFORM_LIST ${BASE_PLATFORM_LIST} w7 s8 p8 g9 h9)
- else()
-- set(BASE_PLATFORM_LIST ${BASE_PLATFORM_LIST} m7 n8 y8 e9 l9 n0 k0 k1)
-+ set(BASE_PLATFORM_LIST ${BASE_PLATFORM_LIST} e9 l9 k0 k1)
- endif(${ARCH} MATCHES "ia32")
- endif(APPLE)
- endif(UNIX)
diff --git a/bazel/patches/upb.patch b/bazel/patches/upb.patch
deleted file mode 100644
index da3b828..0000000
--- a/bazel/patches/upb.patch
+++ /dev/null
@@ -1,29 +0,0 @@
-diff --git a/bazel/build_defs.bzl b/bazel/build_defs.bzl
-index b5bc64f0..dc30a75f 100644
---- a/bazel/build_defs.bzl
-+++ b/bazel/build_defs.bzl
-@@ -38,7 +38,7 @@ _DEFAULT_CPPOPTS.extend([
- "-Wno-long-long",
- ])
- _DEFAULT_COPTS.extend([
-- "-std=c99",
-+ "-std=c11",
- "-pedantic",
- "-Werror=pedantic",
- "-Wall",
-diff --git a/upb/port_def.inc b/upb/port_def.inc
-index 92e4bf24..e355ace7 100644
---- a/upb/port_def.inc
-+++ b/upb/port_def.inc
-@@ -92,7 +92,11 @@
- #define UPB_ALIGN_UP(size, align) (((size) + (align) - 1) / (align) * (align))
- #define UPB_ALIGN_DOWN(size, align) ((size) / (align) * (align))
- #define UPB_ALIGN_MALLOC(size) UPB_ALIGN_UP(size, UPB_MALLOC_ALIGN)
-+#ifdef __clang__
-+#define UPB_ALIGN_OF(type) _Alignof(type)
-+#else
- #define UPB_ALIGN_OF(type) offsetof (struct { char c; type member; }, member)
-+#endif
-
- /* Hints to the compiler about likely/unlikely branches. */
- #if defined (__GNUC__) || defined(__clang__)
diff --git a/bazel/psi.bzl b/bazel/psi.bzl
index 50f0151..ae8a957 100644
--- a/bazel/psi.bzl
+++ b/bazel/psi.bzl
@@ -17,7 +17,7 @@ warpper bazel cc_xx to modify flags.
"""
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
-load("@yacl//bazel:yacl.bzl", "OMP_CFLAGS", "OMP_DEPS", "OMP_LINKFLAGS", "yacl_cmake_external")
+load("@yacl//bazel:yacl.bzl", "OMP_CFLAGS", "OMP_DEPS", "OMP_LINKFLAGS")
WARNING_FLAGS = [
"-Wall",
@@ -55,7 +55,7 @@ def psi_cc_library(
linkopts = linkopts + OMP_LINKFLAGS,
copts = _psi_copts() + copts + OMP_CFLAGS,
deps = deps + [
- "@com_github_gabime_spdlog//:spdlog",
+ "@spdlog//:spdlog",
] + OMP_DEPS,
**kargs
)
@@ -85,8 +85,8 @@ def psi_cc_test(
# -lm for tcmalloc
linkopts = linkopts + ["-lm", "-ldl"],
copts = _psi_copts() + copts,
- deps = deps + [
+ deps = [
"@com_google_googletest//:gtest_main",
- ],
+ ] + deps,
**kwargs
)
diff --git a/bazel/repositories.bzl b/bazel/repositories.bzl
index a2e01d5..48e8d6a 100644
--- a/bazel/repositories.bzl
+++ b/bazel/repositories.bzl
@@ -16,63 +16,26 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe")
def psi_deps():
- _com_github_nelhage_rules_boost()
- _bazel_platform()
- _upb()
- _com_github_emptoolkit_emp_tool()
_com_github_facebook_zstd()
_com_github_microsoft_seal()
_com_github_microsoft_apsi()
_com_github_microsoft_gsl()
_com_github_microsoft_kuku()
_com_google_flatbuffers()
- _org_apache_arrow()
- _com_github_grpc_grpc()
- _com_github_tencent_rapidjson()
- _com_github_xtensor_xsimd()
- _brotli()
- _com_github_lz4_lz4()
- _org_apache_thrift()
- _com_google_double_conversion()
- _bzip2()
- _com_github_google_snappy()
+
_com_github_google_perfetto()
_com_github_floodyberry_curve25519_donna()
- _com_github_ridiculousfish_libdivide()
- _com_github_sparsehash_sparsehash()
- _com_github_intel_ipp()
- _yacl()
+
_com_github_zeromq_cppzmq()
_com_github_zeromq_libzmq()
_com_github_log4cplus_log4cplus()
_com_github_open_source_parsers_jsoncpp()
-def _yacl():
- maybe(
- http_archive,
- name = "yacl",
- urls = [
- "https://github.com/secretflow/yacl/archive/refs/tags/0.4.5b5_nightly_20240913.tar.gz",
- ],
- strip_prefix = "yacl-0.4.5b5_nightly_20240913",
- sha256 = "04b332246e3ccb57b5dd612353ed2e84f894e5537a3e854c020c8172793c07d6",
- )
-
-def _bazel_platform():
- http_archive(
- name = "platforms",
- urls = [
- "https://mirror.bazel.build/github.com/bazelbuild/platforms/releases/download/0.0.8/platforms-0.0.8.tar.gz",
- "https://github.com/bazelbuild/platforms/releases/download/0.0.8/platforms-0.0.8.tar.gz",
- ],
- sha256 = "8150406605389ececb6da07cbcb509d5637a3ab9a24bc69b1101531367d89d74",
- )
-
def _com_github_facebook_zstd():
maybe(
http_archive,
- name = "com_github_facebook_zstd",
- build_file = "@psi//bazel:zstd.BUILD",
+ name = "zstd",
+ build_file = "//bazel:zstd.BUILD",
strip_prefix = "zstd-1.5.5",
sha256 = "98e9c3d949d1b924e28e01eccb7deed865eefebf25c2f21c702e5cd5b63b85e1",
type = ".tar.gz",
@@ -81,61 +44,10 @@ def _com_github_facebook_zstd():
],
)
-def _upb():
- maybe(
- http_archive,
- name = "upb",
- sha256 = "017a7e8e4e842d01dba5dc8aa316323eee080cd1b75986a7d1f94d87220e6502",
- strip_prefix = "upb-e4635f223e7d36dfbea3b722a4ca4807a7e882e2",
- urls = [
- "https://storage.googleapis.com/grpc-bazel-mirror/github.com/protocolbuffers/upb/archive/e4635f223e7d36dfbea3b722a4ca4807a7e882e2.tar.gz",
- "https://github.com/protocolbuffers/upb/archive/e4635f223e7d36dfbea3b722a4ca4807a7e882e2.tar.gz",
- ],
- patch_args = ["-p1"],
- patches = [
- "@psi//bazel/patches:upb.patch",
- ],
- )
-
-def _com_github_emptoolkit_emp_tool():
- maybe(
- http_archive,
- name = "com_github_emptoolkit_emp_tool",
- sha256 = "b9ab2380312e78020346b5d2db3d0244c7bd8098cb50f8b3620532ef491808d0",
- strip_prefix = "emp-tool-0.2.5",
- type = "tar.gz",
- patch_args = ["-p1"],
- patches = [
- "@psi//bazel/patches:emp-tool.patch",
- "@psi//bazel/patches:emp-tool-cmake.patch",
- "@psi//bazel/patches:emp-tool-sse2neon.patch",
- ],
- urls = [
- "https://github.com/emp-toolkit/emp-tool/archive/refs/tags/0.2.5.tar.gz",
- ],
- build_file = "@psi//bazel:emp-tool.BUILD",
- )
-
-def _com_github_intel_ipp():
- maybe(
- http_archive,
- name = "com_github_intel_ipp",
- sha256 = "d70f42832337775edb022ca8ac1ac418f272e791ec147778ef7942aede414cdc",
- strip_prefix = "cryptography-primitives-ippcp_2021.8",
- build_file = "@psi//bazel:ipp.BUILD",
- patch_args = ["-p1"],
- patches = [
- "@psi//bazel/patches:ippcp.patch",
- ],
- urls = [
- "https://github.com/intel/cryptography-primitives/archive/refs/tags/ippcp_2021.8.tar.gz",
- ],
- )
-
def _com_github_microsoft_seal():
maybe(
http_archive,
- name = "com_github_microsoft_seal",
+ name = "seal",
sha256 = "af9bf0f0daccda2a8b7f344f13a5692e0ee6a45fea88478b2b90c35648bf2672",
strip_prefix = "SEAL-4.1.1",
type = "tar.gz",
@@ -150,7 +62,7 @@ def _com_github_microsoft_seal():
def _com_github_microsoft_apsi():
maybe(
http_archive,
- name = "com_github_microsoft_apsi",
+ name = "apsi",
sha256 = "82c0f9329c79222675109d4a3682d204acd3ea9a724bcd98fa58eabe53851333",
strip_prefix = "APSI-0.11.0",
urls = [
@@ -183,7 +95,7 @@ def _com_github_microsoft_gsl():
def _com_github_microsoft_kuku():
maybe(
http_archive,
- name = "com_github_microsoft_kuku",
+ name = "kuku",
sha256 = "96ed5fad82ea8c8a8bb82f6eaf0b5dce744c0c2566b4baa11d8f5443ad1f83b7",
strip_prefix = "Kuku-2.1.0",
type = "tar.gz",
@@ -211,149 +123,10 @@ def _com_google_flatbuffers():
build_file = "@psi//bazel:flatbuffers.BUILD",
)
-def _org_apache_arrow():
- maybe(
- http_archive,
- name = "org_apache_arrow",
- urls = [
- "https://github.com/apache/arrow/archive/apache-arrow-10.0.0.tar.gz",
- ],
- sha256 = "2852b21f93ee84185a9d838809c9a9c41bf6deca741bed1744e0fdba6cc19e3f",
- strip_prefix = "arrow-apache-arrow-10.0.0",
- build_file = "@psi//bazel:arrow.BUILD",
- )
-
-def _com_github_grpc_grpc():
- maybe(
- http_archive,
- name = "com_github_grpc_grpc",
- sha256 = "7f42363711eb483a0501239fd5522467b31d8fe98d70d7867c6ca7b52440d828",
- strip_prefix = "grpc-1.51.0",
- type = "tar.gz",
- patch_args = ["-p1"],
- patches = ["@psi//bazel/patches:grpc.patch"],
- urls = [
- "https://github.com/grpc/grpc/archive/refs/tags/v1.51.0.tar.gz",
- ],
- )
-
-def _com_github_nelhage_rules_boost():
- # use boost 1.83
- RULES_BOOST_COMMIT = "cfa585b1b5843993b70aa52707266dc23b3282d0"
- maybe(
- http_archive,
- name = "com_github_nelhage_rules_boost",
- sha256 = "a7c42df432fae9db0587ff778d84f9dc46519d67a984eff8c79ae35e45f277c1",
- strip_prefix = "rules_boost-%s" % RULES_BOOST_COMMIT,
- patch_args = ["-p1"],
- patches = ["@psi//bazel/patches:boost.patch"],
- urls = [
- "https://github.com/nelhage/rules_boost/archive/%s.tar.gz" % RULES_BOOST_COMMIT,
- ],
- )
-
-def _com_github_tencent_rapidjson():
- maybe(
- http_archive,
- name = "com_github_tencent_rapidjson",
- urls = [
- "https://github.com/Tencent/rapidjson/archive/refs/tags/v1.1.0.tar.gz",
- ],
- sha256 = "bf7ced29704a1e696fbccf2a2b4ea068e7774fa37f6d7dd4039d0787f8bed98e",
- strip_prefix = "rapidjson-1.1.0",
- build_file = "@psi//bazel:rapidjson.BUILD",
- )
-
-def _com_github_xtensor_xsimd():
- maybe(
- http_archive,
- name = "com_github_xtensor_xsimd",
- urls = [
- "https://codeload.github.com/xtensor-stack/xsimd/tar.gz/refs/tags/8.1.0",
- ],
- sha256 = "d52551360d37709675237d2a0418e28f70995b5b7cdad7c674626bcfbbf48328",
- type = "tar.gz",
- strip_prefix = "xsimd-8.1.0",
- build_file = "@psi//bazel:xsimd.BUILD",
- )
-
-def _brotli():
- maybe(
- http_archive,
- name = "brotli",
- build_file = "@psi//bazel:brotli.BUILD",
- sha256 = "e720a6ca29428b803f4ad165371771f5398faba397edf6778837a18599ea13ff",
- strip_prefix = "brotli-1.1.0",
- urls = [
- "https://github.com/google/brotli/archive/refs/tags/v1.1.0.tar.gz",
- ],
- )
-
-def _com_github_lz4_lz4():
- maybe(
- http_archive,
- name = "com_github_lz4_lz4",
- urls = [
- "https://codeload.github.com/lz4/lz4/tar.gz/refs/tags/v1.9.3",
- ],
- sha256 = "030644df4611007ff7dc962d981f390361e6c97a34e5cbc393ddfbe019ffe2c1",
- type = "tar.gz",
- strip_prefix = "lz4-1.9.3",
- build_file = "@psi//bazel:lz4.BUILD",
- )
-
-def _org_apache_thrift():
- maybe(
- http_archive,
- name = "org_apache_thrift",
- build_file = "@psi//bazel:thrift.BUILD",
- sha256 = "31e46de96a7b36b8b8a457cecd2ee8266f81a83f8e238a9d324d8c6f42a717bc",
- strip_prefix = "thrift-0.21.0",
- urls = [
- "https://github.com/apache/thrift/archive/v0.21.0.tar.gz",
- ],
- )
-
-def _com_google_double_conversion():
- maybe(
- http_archive,
- name = "com_google_double_conversion",
- sha256 = "04ec44461850abbf33824da84978043b22554896b552c5fd11a9c5ae4b4d296e",
- strip_prefix = "double-conversion-3.3.0",
- build_file = "@psi//bazel:double-conversion.BUILD",
- urls = [
- "https://github.com/google/double-conversion/archive/refs/tags/v3.3.0.tar.gz",
- ],
- )
-
-def _bzip2():
- maybe(
- http_archive,
- name = "bzip2",
- build_file = "@psi//bazel:bzip2.BUILD",
- sha256 = "ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269",
- strip_prefix = "bzip2-1.0.8",
- urls = [
- "https://sourceware.org/pub/bzip2/bzip2-1.0.8.tar.gz",
- ],
- )
-
-def _com_github_google_snappy():
- maybe(
- http_archive,
- name = "com_github_google_snappy",
- urls = [
- "https://github.com/google/snappy/archive/refs/tags/1.1.9.tar.gz",
- ],
- sha256 = "75c1fbb3d618dd3a0483bff0e26d0a92b495bbe5059c8b4f1c962b478b6e06e7",
- strip_prefix = "snappy-1.1.9",
- build_file = "@psi//bazel:snappy.BUILD",
- )
-
def _com_github_google_perfetto():
maybe(
http_archive,
- name = "com_github_google_perfetto",
+ name = "perfetto",
urls = [
"https://github.com/google/perfetto/archive/refs/tags/v41.0.tar.gz",
],
@@ -367,7 +140,7 @@ def _com_github_google_perfetto():
def _com_github_floodyberry_curve25519_donna():
maybe(
http_archive,
- name = "com_github_floodyberry_curve25519_donna",
+ name = "curve25519-donna",
strip_prefix = "curve25519-donna-2fe66b65ea1acb788024f40a3373b8b3e6f4bbb2",
sha256 = "ba57d538c241ad30ff85f49102ab2c8dd996148456ed238a8c319f263b7b149a",
type = "tar.gz",
@@ -377,30 +150,6 @@ def _com_github_floodyberry_curve25519_donna():
],
)
-def _com_github_ridiculousfish_libdivide():
- maybe(
- http_archive,
- name = "com_github_ridiculousfish_libdivide",
- urls = [
- "https://github.com/ridiculousfish/libdivide/archive/refs/tags/5.0.tar.gz",
- ],
- sha256 = "01ffdf90bc475e42170741d381eb9cfb631d9d7ddac7337368bcd80df8c98356",
- strip_prefix = "libdivide-5.0",
- build_file = "@psi//bazel:libdivide.BUILD",
- )
-
-def _com_github_sparsehash_sparsehash():
- maybe(
- http_archive,
- name = "com_github_sparsehash_sparsehash",
- urls = [
- "https://github.com/sparsehash/sparsehash/archive/refs/tags/sparsehash-2.0.4.tar.gz",
- ],
- sha256 = "8cd1a95827dfd8270927894eb77f62b4087735cbede953884647f16c521c7e58",
- strip_prefix = "sparsehash-sparsehash-2.0.4",
- build_file = "@psi//bazel:sparsehash.BUILD",
- )
-
def _com_github_zeromq_cppzmq():
maybe(
http_archive,
diff --git a/bazel/seal.BUILD b/bazel/seal.BUILD
index e933044..92fc017 100644
--- a/bazel/seal.BUILD
+++ b/bazel/seal.BUILD
@@ -37,11 +37,11 @@ cmake(
# "SEAL_USE_INTEL_HEXL": "ON",
},
generate_args = ["-GNinja"],
- lib_source = "@com_github_microsoft_seal//:all",
+ lib_source = "@seal//:all",
out_include_dir = "include/SEAL-4.1",
out_static_libs = ["libseal-4.1.a"],
deps = [
- "@com_github_facebook_zstd//:zstd",
+ "@zstd",
"@com_github_microsoft_gsl//:Microsoft.GSL",
"@zlib",
# Uncomment to use hexl
diff --git a/bazel/snappy.BUILD b/bazel/snappy.BUILD
deleted file mode 100644
index 419b694..0000000
--- a/bazel/snappy.BUILD
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
- name = "all_srcs",
- srcs = glob(["**"]),
-)
-
-cmake(
- name = "snappy",
- cache_entries = {
- "SNAPPY_BUILD_TESTS": "OFF",
- "SNAPPY_BUILD_BENCHMARKS": "OFF",
- "CMAKE_INSTALL_LIBDIR": "lib",
- },
- generate_crosstool_file = False,
- install_args = [
- "--prefix $${INSTALLDIR}",
- ],
- lib_source = ":all_srcs",
- out_static_libs = [
- "libsnappy.a",
- ],
-)
diff --git a/bazel/thrift.BUILD b/bazel/thrift.BUILD
deleted file mode 100644
index 5d1cc9b..0000000
--- a/bazel/thrift.BUILD
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# copied from https://github.com/tensorflow/io/blob/v0.25.0/third_party/thrift.BUILD
-# Description:
-# Apache Thrift library
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"]) # Apache 2.0
-
-exports_files(["LICENSE"])
-
-cc_library(
- name = "thrift",
- srcs = glob([
- "lib/cpp/src/thrift/**/*.h",
- ]) + [
- "lib/cpp/src/thrift/protocol/TProtocol.cpp",
- "lib/cpp/src/thrift/transport/TBufferTransports.cpp",
- "lib/cpp/src/thrift/transport/TTransportException.cpp",
- ],
- hdrs = [
- "compiler/cpp/src/thrift/version.h",
- "lib/cpp/src/thrift/config.h",
- ],
- includes = [
- "lib/cpp/src",
- ],
- textual_hdrs = [
- "lib/cpp/src/thrift/protocol/TBinaryProtocol.tcc",
- "lib/cpp/src/thrift/protocol/TCompactProtocol.tcc",
- ],
- deps = [
- "@boost//:units",
- ],
-)
-
-genrule(
- name = "config_h",
- srcs = ["build/cmake/config.h.in"],
- outs = ["lib/cpp/src/thrift/config.h"],
- cmd = ("sed " +
- "-e 's/cmakedefine/define/g' " +
- "-e 's/$${PACKAGE}/thrift/g' " +
- "-e 's/$${PACKAGE_BUGREPORT}//g' " +
- "-e 's/$${PACKAGE_NAME}/thrift/g' " +
- "-e 's/$${PACKAGE_TARNAME}/thrift/g' " +
- "-e 's/$${PACKAGE_URL}//g' " +
- "-e 's/$${PACKAGE_VERSION}/0.12.0/g' " +
- "-e 's/$${PACKAGE_STRING}/thrift 0.12.0/g' " +
- "$< >$@"),
-)
diff --git a/bazel/xsimd.BUILD b/bazel/xsimd.BUILD
deleted file mode 100644
index 4c3361c..0000000
--- a/bazel/xsimd.BUILD
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# copied from https://github.com/tensorflow/io/blob/v0.25.0/third_party/xsimd.BUILD
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"]) # BSD 3-Clause
-
-exports_files(["LICENSE"])
-
-cc_library(
- name = "xsimd",
- srcs = [],
- hdrs = glob(
- [
- "include/xsimd/*.hpp",
- "include/xsimd/config/*.hpp",
- "include/xsimd/math/*.hpp",
- "include/xsimd/memory/*.hpp",
- "include/xsimd/stl/*.hpp",
- "include/xsimd/types/*.hpp",
- ],
- exclude = [
- ],
- ),
- copts = [],
- defines = [],
- includes = [
- "include",
- ],
- linkopts = [],
- visibility = ["//visibility:public"],
- deps = [
- ],
-)
diff --git a/bazel/zstd.BUILD b/bazel/zstd.BUILD
index 2efa3c6..bf77f9f 100644
--- a/bazel/zstd.BUILD
+++ b/bazel/zstd.BUILD
@@ -32,7 +32,7 @@ cmake(
"CMAKE_INSTALL_LIBDIR": "lib",
},
generate_args = ["-GNinja"],
- lib_source = "@com_github_facebook_zstd//:all",
+ lib_source = "@zstd//:all",
out_include_dir = "include/",
out_static_libs = ["libzstd.a"],
working_directory = "build/cmake",
diff --git a/benchmark/Makefile b/benchmark/Makefile
index 50569ce..7f80eaa 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -35,5 +35,5 @@ analysis:
docker logs $(DOCKER_PROJ_NAME)-psi-sender-1 > $(PWD)/docker-compose/logs/sender/psi.log
docker logs $(DOCKER_PROJ_NAME)-psi-receiver-1 > $(PWD)/docker-compose/logs/receiver/psi.log
-start-docker:
+start-docker:
@(cd $(PWD)/docker-compose && docker compose -p ${DOCKER_PROJ_NAME} up -d)
diff --git a/benchmark/docker-compose/setup_wan.sh b/benchmark/docker-compose/setup_wan.sh
index 3a34e27..cc4b1c0 100644
--- a/benchmark/docker-compose/setup_wan.sh
+++ b/benchmark/docker-compose/setup_wan.sh
@@ -3,3 +3,4 @@ set -eu
yum install iproute-tc -y;
tc qdisc add dev eth0 root handle 1: tbf rate 100mbit burst 128kb latency 10ms;
tc qdisc add dev eth0 parent 1:1 handle 10: netem delay 10msec limit 8000
+
diff --git a/benchmark/plot_csv_data.py b/benchmark/plot_csv_data.py
index 305a0b4..e33f2e7 100644
--- a/benchmark/plot_csv_data.py
+++ b/benchmark/plot_csv_data.py
@@ -21,7 +21,9 @@
def plot_cpu(docker_csv_path, output_path):
df1 = pd.read_csv(docker_csv_path)
- plt.plot(df1["running_time_s"], df1["cpu_percent"], marker="o", linestyle="-", color="b")
+ plt.plot(
+ df1["running_time_s"], df1["cpu_percent"], marker="o", linestyle="-", color="b"
+ )
max_time_count = 10
interval = 1
if len(df1) > max_time_count:
@@ -44,10 +46,13 @@ def plot_cpu(docker_csv_path, output_path):
plt.savefig(output_path)
plt.clf()
+
def plot_mem(docker_csv_path, output_path):
df1 = pd.read_csv(docker_csv_path)
- plt.plot(df1["running_time_s"], df1["mem_usage_MB"], marker="o", linestyle="-", color="b")
+ plt.plot(
+ df1["running_time_s"], df1["mem_usage_MB"], marker="o", linestyle="-", color="b"
+ )
max_time_count = 10
interval = 1
if len(df1) > max_time_count:
@@ -70,11 +75,16 @@ def plot_mem(docker_csv_path, output_path):
plt.savefig(output_path)
plt.clf()
+
def plot_net(docker_csv_path, output_path):
df1 = pd.read_csv(docker_csv_path)
- plt.plot(df1["running_time_s"], df1["net_tx_kb"], marker="o", linestyle="-", color="b")
- plt.plot(df1["running_time_s"], df1["net_rx_kb"], marker="*", linestyle="-", color="y")
+ plt.plot(
+ df1["running_time_s"], df1["net_tx_kb"], marker="o", linestyle="-", color="b"
+ )
+ plt.plot(
+ df1["running_time_s"], df1["net_rx_kb"], marker="*", linestyle="-", color="y"
+ )
max_time_count = 10
interval = 1
if len(df1) > max_time_count:
diff --git a/benchmark/stats.py b/benchmark/stats.py
index 84b1040..0780167 100644
--- a/benchmark/stats.py
+++ b/benchmark/stats.py
@@ -20,6 +20,7 @@
import time
from datetime import datetime
+
def stream_container_stats(container_name, output_file):
client = docker.from_env()
@@ -27,8 +28,16 @@ def stream_container_stats(container_name, output_file):
container = client.containers.get(container_name)
stats_stream = container.stats(stream=True)
- with open(output_file, 'w', newline='') as csvfile:
- fieldnames = ['cpu_percent', 'mem_usage_MB', 'mem_limit_MB', 'net_tx_kb', 'net_rx_kb', 'running_time_s', 'time']
+ with open(output_file, "w", newline="") as csvfile:
+ fieldnames = [
+ "cpu_percent",
+ "mem_usage_MB",
+ "mem_limit_MB",
+ "net_tx_kb",
+ "net_rx_kb",
+ "running_time_s",
+ "time",
+ ]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
@@ -40,36 +49,54 @@ def stream_container_stats(container_name, output_file):
for stats in stats_stream:
data = json.loads(stats)
running_time_s = int(time.time()) - start_unix_time
- cpu_percent = ((data['cpu_stats']['cpu_usage']['total_usage'] - prev_cpu_total) /
- (data['cpu_stats']['system_cpu_usage'] - prev_cpu_system)) * 100 * os.cpu_count()
- mem_usage = (data['memory_stats']['usage'] - data['memory_stats']['stats']['inactive_file']) / 1024 / 1024
- mem_limit = data['memory_stats']['limit'] / 1024 / 1024
+ cpu_percent = (
+ (
+ (data["cpu_stats"]["cpu_usage"]["total_usage"] - prev_cpu_total)
+ / (data["cpu_stats"]["system_cpu_usage"] - prev_cpu_system)
+ )
+ * 100
+ * os.cpu_count()
+ )
+ mem_usage = (
+ (
+ data["memory_stats"]["usage"]
+ - data["memory_stats"]["stats"]["inactive_file"]
+ )
+ / 1024
+ / 1024
+ )
+ mem_limit = data["memory_stats"]["limit"] / 1024 / 1024
net_tx = 0
net_rx = 0
- for key, value in data['networks'].items():
- net_tx += value['tx_bytes'] / 1024
- net_rx += value['rx_bytes'] / 1024
+ for key, value in data["networks"].items():
+ net_tx += value["tx_bytes"] / 1024
+ net_rx += value["rx_bytes"] / 1024
# skip first five seconds, due to running setting up network
if running_time_s > 5:
- writer.writerow({
- 'cpu_percent': cpu_percent,
- 'mem_usage_MB': int(mem_usage),
- 'mem_limit_MB': int(mem_limit),
- 'net_tx_kb': int((net_tx - prev_net_tx) * 8),
- 'net_rx_kb': int((net_rx - prev_net_rx) * 8),
- 'running_time_s': running_time_s,
- 'time': datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
- })
+ writer.writerow(
+ {
+ "cpu_percent": cpu_percent,
+ "mem_usage_MB": int(mem_usage),
+ "mem_limit_MB": int(mem_limit),
+ "net_tx_kb": int((net_tx - prev_net_tx) * 8),
+ "net_rx_kb": int((net_rx - prev_net_rx) * 8),
+ "running_time_s": running_time_s,
+ "time": datetime.fromtimestamp(time.time()).strftime(
+ "%H:%M:%S"
+ ),
+ }
+ )
prev_net_tx = net_tx
prev_net_rx = net_rx
- prev_cpu_total = data['cpu_stats']['cpu_usage']['total_usage']
- prev_cpu_system = data['cpu_stats']['system_cpu_usage']
+ prev_cpu_total = data["cpu_stats"]["cpu_usage"]["total_usage"]
+ prev_cpu_system = data["cpu_stats"]["system_cpu_usage"]
except docker.errors.NotFound:
print(f"Container {container_name} not found.")
except Exception as e:
- if container.status != 'exited':
+ if container.status != "exited":
print(f"An error occurred: {e} container.status: {container.status}")
+
if __name__ == "__main__":
- stream_container_stats(sys.argv[1], sys.argv[2])
\ No newline at end of file
+ stream_container_stats(sys.argv[1], sys.argv[2])
diff --git a/docs/development/psi_protocol_intro.rst b/docs/development/psi_protocol_intro.rst
index 72b637c..f5b8710 100644
--- a/docs/development/psi_protocol_intro.rst
+++ b/docs/development/psi_protocol_intro.rst
@@ -13,12 +13,12 @@ SecretFlow SPU implements the following PSI protocols,
ECDH-PSI
--------
-The semi-honest DH-PSI protocol is due to Huberman, Franklin, and Hogg [HFH99]_,
+The semi-honest DH-PSI protocol is due to Huberman, Franklin, and Hogg [HFH99]_,
but with roots as far back as Meadows [Mea86]_. It is a semi-honest protocol that
requires exponentiations in a Diffie-Hellman group proportional to the number of items in the sets.
-As a general rule, OT-based PSI protocols are (significantly) faster but require more communication
-than Diffie-Hellman-based PSI protocols.
+As a general rule, OT-based PSI protocols are (significantly) faster but require more communication
+than Diffie-Hellman-based PSI protocols.
In some scenarios, communication cost is overwhelmingly more important than computation cost.
DH-PSI protocol based on the Decisional Diffie-Hellman assumption:
@@ -32,22 +32,22 @@ Curve25519 [Ber06]_ offer a good balance between security and performance.
.. figure:: ../_static/dh_psi.png
-1. For each element :math:`x_i` in its set, Alice applies the hash function and then exponentiates it
- using its key :math:`\alpha`, thus computing :math:`{H(x_i)}^\alpha` . Alice sends
+1. For each element :math:`x_i` in its set, Alice applies the hash function and then exponentiates it
+ using its key :math:`\alpha`, thus computing :math:`{H(x_i)}^\alpha` . Alice sends
:math:`\{\{H(x_i)\}^\alpha\}_{i=1}^{n_1}` to Bob.
-2. For each element :math:`{H(x_i)}^\alpha` received from Alice in the previous step, Bob exponentiates
- it using its key :math:`\beta`, computing :math:`{H(x_i)}^{\alpha\beta}`.
+2. For each element :math:`{H(x_i)}^\alpha` received from Alice in the previous step, Bob exponentiates
+ it using its key :math:`\beta`, computing :math:`{H(x_i)}^{\alpha\beta}`.
Bob sends :math:`{\{\{H(x_i)\}^{\alpha\beta}\}}_{i=1}^{n_1}` to Alice.
-3. For each element :math:`y_i` in its set, Bob applies the hash function and then exponentiates it
- using its key :math:`\beta`, thus computing :math:`{H(y_i)}^\beta` .
+3. For each element :math:`y_i` in its set, Bob applies the hash function and then exponentiates it
+ using its key :math:`\beta`, thus computing :math:`{H(y_i)}^\beta` .
Bob sends the set :math:`\{\{H(y_i)\}^\beta\}_{i=1}^{n_2}` to Alice.
-4. For each element :math:`{H(y_i)}^\beta` received from Bob in the previous step, Alice exponentiates
- it using its key :math:`\alpha`, computing :math:`{H(y_i)}^{\beta\alpha}` .
+4. For each element :math:`{H(y_i)}^\beta` received from Bob in the previous step, Alice exponentiates
+ it using its key :math:`\alpha`, computing :math:`{H(y_i)}^{\beta\alpha}` .
-5. Alice compares two set :math:`{\{\{H(x_i)\}^{\alpha\beta}\}}_{i=1}^{n_1}`
+5. Alice compares two set :math:`{\{\{H(x_i)\}^{\alpha\beta}\}}_{i=1}^{n_1}`
and :math:`{\{\{H(y_i)\}^{\beta\alpha}\}}_{i=1}^{n_2}` and gets intersection.
The Elliptic Curve groups, supported in secretflow SPU PSI module.
@@ -74,27 +74,27 @@ ECDH-PSI (3P)
We implement our own three-party PSI protocol based on ECDH. Note that our implementation has known
leakage, please use at your own risk.
-Assume Alice, Bob, Charlie (receiver) want to perform 3P PSI, in addition to the final output, our
+Assume Alice, Bob, Charlie (receiver) want to perform 3P PSI, in addition to the final output, our
protocol leaks the intersection size of Alice's data and Bob's data to Charlie.
.. figure:: ../_static/dh_psi_3p.png
-Note that at the beginning of ECDH-PSI protocol, we assume the input data from both Alice and Charlie are
+Note that at the beginning of ECDH-PSI protocol, we assume the input data from both Alice and Charlie are
shuffled (It's not necessary to shuffle Bob's set).
Protocol:
1. For i-th element in its set, Alice calculates :math:`H(x_i)^\alpha` and sends to Bob.
-2. For i-th element, Bob calculates :math:`H(x_i)^{\alpha\beta}` and
+2. For i-th element, Bob calculates :math:`H(x_i)^{\alpha\beta}` and
:math:`H(y_i)^\beta`, then shuffles them randomly and sends them to Alice.
-3. For i-th element, Alice calculates :math:`H(y_i)^{\alpha\beta}` and gets the intersection of
- :math:`H(x_i)^{\alpha\beta} \cap H(y_i)^{\alpha\beta}` (we denote the intersection as
+3. For i-th element, Alice calculates :math:`H(y_i)^{\alpha\beta}` and gets the intersection of
+ :math:`H(x_i)^{\alpha\beta} \cap H(y_i)^{\alpha\beta}` (we denote the intersection as
:math:`I^{\alpha\beta}`), then sends :math:`I^{\alpha\beta}` to Charlie.
-4. For i-th element, Charlie sends :math:`H(z_i)^{\gamma}` to Bob, Bob calculates and sends to
- Alice :math:`H(z_i)^{\beta\gamma}`, finally Alice calculates and sends to
+4. For i-th element, Charlie sends :math:`H(z_i)^{\gamma}` to Bob, Bob calculates and sends to
+ Alice :math:`H(z_i)^{\beta\gamma}`, finally Alice calculates and sends to
Charlie :math:`H(z_i)^{\alpha\beta\gamma}`.
5. Charlie calculates :math:`I^{\alpha\beta\gamma}` and compares :math:`I^{\alpha\beta\gamma}` with
@@ -103,9 +103,9 @@ Protocol:
KKRT16-PSI
----------
-[KKRT16]_ is semi-honest OT-based PSI, based on OT Extension, BaRK-OPRF and CuckooHash.
-[KKRT16]_ is the first PSI protocol requiring only one minute for the case of larger sets
-( :math:`2^{24}` items each) of long strings (128 bits).
+[KKRT16]_ is semi-honest OT-based PSI, based on OT Extension, BaRK-OPRF and CuckooHash.
+[KKRT16]_ is the first PSI protocol requiring only one minute for the case of larger sets
+( :math:`2^{24}` items each) of long strings (128 bits).
We use 3-way stash-less CuckooHash proposed in [PSZ18]_.
@@ -127,20 +127,20 @@ Protocol. Our implementation bases on ECDH-PSI, and provides:
- Differentially private PSI results.
-This feature is currently under test, please use at your own risk!
+This feature is currently under test, please use at your own risk!
Why PSI with differentially private results? If we want a scheme that protects
both the private inputs and output privacy, an ideal way is to use `circuit
PSI`, which is a typical PSI variant that allows secure computation (e.g. MPC or
HE) on the PSI result without revealing it. `PSTY19
`_ However those protocols are expensive
-in terms of efficiency.
+in terms of efficiency.
DP-PSI is a way of utilizing the up-sampling and sub-sampling mechanism to add
-calibrated noises to the PSI results, without revealing its concise value.
+calibrated noise to the PSI results, without revealing its precise value. 
The protocol is listed below, assume Alice has a (hashed and shuffled) set
-:math:`X` and Bob has a (hashed and shuffled) :math:`Y`.
+:math:`X` and Bob has a (hashed and shuffled) :math:`Y`.
.. figure:: ../_static/dp_psi.png
@@ -151,33 +151,33 @@ Protocol:
1. Alice and Bob first encrypts their own dataset, and gets :math:`X^a` and
:math:`Y^b` separately.
-
+
2. Alice sends :math:`X^a` to Bob.
-
+
3. Bob performs random subsampling on :math:`Y^b`, gets :math:`Y_*^b` and sends it
to Alice. In the meantime, on receiving :math:`X^a` from Alice, Bob
re-encrypts it with :math:`b`, gets :math:`X^{ab}`. Then it samples a random
permutation :math:`\pi` to permute Alice's set, and sends permuted
:math:`\pi(X^{ab})` back to Alice.
-
+
4. On receiving :math:`Y_*^b` and :math:`\pi(X^{ab})` from Bob, Alice re-encrypts
:math:`Y_*^b` and gets :math:`Y_*^{ab}`, then calculates the intersection
:math:`I_*^{ab}\gets\pi(X^{ab})\cap Y_*^{ab}`.
-
+
5. Alice randomly subsamples the intersection, gets :math:`I_{**}^{ab}`, and
then finds their corresponding index in :math:`Y_*^b`. Then randomly adds
non-intersection index to this set.
-
+
6. Alice sends the index set to Bob, then Bob reveals the final results.
In the end, this scheme ensures that the receiver (Bob) only learns the noised
intersection, without the ability of pointing out whether an element is in the
-actual set intersection or not.
+actual set intersection or not.
Note that multiple invocations of DP-PSI inevitably weaken the privacy
protection, therefore, we strongly suggest that user should implement a
protection mechanism to prevent multiple DP-PSI executions on the same input
-value.
+value.
+---------------------------+--------+---------+---------+---------+-----------+
| Intel(R) Xeon(R) Platinum | 2^20 | 2^21 | 2^22 | 2^23 | 2^24 |
@@ -194,31 +194,31 @@ Unbalanced PSI
Ecdh-OPRF based PSI
>>>>>>>>>>>>>>>>>>>
-[RA18]_ section 3 introduces Basic Unbalanced PSI(Ecdh-OPRF based) protocol proposed in [BBCD+11]_ that relaxes
+[RA18]_ section 3 introduces Basic Unbalanced PSI(Ecdh-OPRF based) protocol proposed in [BBCD+11]_ that relaxes
the security of the [JL10]_ to be secure against semi-honest adversaries. The protocol has two phases, the preprocessing phase and the online phase. The
authors introduced many optimizations to push as much computation and communication cost to
the preprocessing phase as possible.
-An Oblivious Pseudorandom Function (OPRF) is a two-party protocol between client and server for computing the
-output of a Pseudorandom Function (PRF). [draft-irtf-cfrg-voprf-10]_ specifies OPRF, VOPRF, and POPRF protocols
+An Oblivious Pseudorandom Function (OPRF) is a two-party protocol between client and server for computing the
+output of a Pseudorandom Function (PRF). [draft-irtf-cfrg-voprf-10]_ specifies OPRF, VOPRF, and POPRF protocols
built upon prime-order groups.
.. figure:: ../_static/ecdh_oprf_psi.jpg
- Offline Phase
-
- 1. For each element :math:`y_i` in its set, Bob applies PRF using
- private key :math:`\beta`, i.e. computing :math:`H_2(y_i,{H_1(y_i)}^\beta)` .
-
+
+ 1. For each element :math:`y_i` in its set, Bob applies PRF using
+ private key :math:`\beta`, i.e. computing :math:`H_2(y_i,{H_1(y_i)}^\beta)` .
+
2. Bob sends :math:`\{\{H_2(y_i,{H_1(y_i)}^\beta)\}\}_{i=1}^{n_2}` to Alice in shuffled order.
-
+
- Online Phase
-
- 1. For each element :math:`x_i` in its set, Alice applies the hash function and then exponentiates
- it using its blind key :math:`r_i`, thus computing :math:`{H_1(x_i)}^{r_i}`. Alice sends
+
+ 1. For each element :math:`x_i` in its set, Alice applies the hash function and then exponentiates
+ it using its blind key :math:`r_i`, thus computing :math:`{H_1(x_i)}^{r_i}`. Alice sends
:math:`\{\{H_1(x_i)\}^{r_i}\}_{i=1}^{n_1}` to Bob.
- 2. For each element :math:`H_1(x_i)^{r_i}` received from Alice in the previous step, Bob exponentiates
- it using its key :math:`\beta`, computing :math:`{H_1(x_i)}^{r_i\beta}`.
+ 2. For each element :math:`H_1(x_i)^{r_i}` received from Alice in the previous step, Bob exponentiates
+ it using its key :math:`\beta`, computing :math:`{H_1(x_i)}^{r_i\beta}`.
Bob sends :math:`{\{\{H_1(x_i)\}^{\{r_i\}\beta}\}}_{i=1}^{n_1}` to Alice.
3. Alice receives :math:`{\{\{H_1(x_i)\}^{r_i\beta}\}}_{i=1}^{n_1}` from Bob, and unblinds it using :math:`r_i`,
gets :math:`\{\{\{H_1(x_i)\}^\beta\}\}_{i=1}^{n_1}`, computes OPRF :math:`\{\{H_2(x_i,{H_1(x_i)}^\beta)\}\}_{i=1}^{n_1}`.
@@ -228,17 +228,17 @@ built upon prime-order groups.
Labeled PSI
>>>>>>>>>>>
-Somewhat homomorphic encryption (SHE) can be used to build efficient (labeled) Private Set Intersection
-protocols in the unbalanced setting, where one of the sets is much larger than the other.
-[CMGD+21]_ introduces several optimizations and improvements to the protocols of
-[CLR17]_, [CHLR18]_, resulting in improved running time and improved communication complexity in the
+Somewhat homomorphic encryption (SHE) can be used to build efficient (labeled) Private Set Intersection
+protocols in the unbalanced setting, where one of the sets is much larger than the other.
+[CMGD+21]_ introduces several optimizations and improvements to the protocols of
+[CLR17]_, [CHLR18]_, resulting in improved running time and improved communication complexity in the
sender's set size.
-Microsoft `APSI (Asymmetric PSI) `_ library provides a PSI functionality
-for asymmetric set sizes based on the latest [CMGD+21]_. APSI uses the BFV([FV12]_) encryption scheme implemented
+Microsoft `APSI (Asymmetric PSI) `_ library provides a PSI functionality
+for asymmetric set sizes based on the latest [CMGD+21]_. APSI uses the BFV([FV12]_) encryption scheme implemented
in the Microsoft [SEAL]_ library.
-SecretFlow SPU wraps `APSI `_ library, can be used for
+SecretFlow SPU wraps `APSI `_ library, can be used for
- Unbalanced PSI
- Malicious PSI
@@ -248,26 +248,26 @@ SecretFlow SPU wraps `APSI `_ library, can be
.. figure:: ../_static/labeled_psi.png
- Setup Phase
-
+
1. **Choose ItemParams**, TableParams, QueryParams, SEALParams.
- 2. **Sender's OPRF**: The sender samples a key :math:`\beta` for the OPRF, updates its items set
+ 2. **Sender's OPRF**: The sender samples a key :math:`\beta` for the OPRF, updates its items set
to :math:`\{\{H_2(s_i,{H_1(s_i)}^\beta)\}\}_{s_i\in S}`.
3. **Sender's Hashing**: Sender inserts all :math:`s_i\in S` into the sets :math:`\mathcal{B}[h_0(s_i)]`,
:math:`\mathcal{B}[h_1(s_i)]` and :math:`\mathcal{B}[h_2(s_i)]`.
4. **Splitting**: For each set :math:`\mathcal{B}[i]`, the sender splits it into bin bundles, denoted as
:math:`\mathcal{B}[i,1]`, ..., :math:`\mathcal{B}[i,k]`.
- 5. **Computing Coeffcients**:
-
- - **Matching Polynomial**: For each bin bundle :math:`\mathcal{B}[i,j]`, the sender computes the
+ 5. **Computing Coefficients**: 
+
+ - **Matching Polynomial**: For each bin bundle :math:`\mathcal{B}[i,j]`, the sender computes the
matching polynomial over :math:`\mathbb{F}_t`.
- - **Label Polynomial**: If the sender has labels associated with its set, then for each bin bundle
+ - **Label Polynomial**: If the sender has labels associated with its set, then for each bin bundle
:math:`\mathcal{B}[i,j]`, the sender interpolates the label polynomial over :math:`\mathbb{F}_t`.
-
+
- Intersection Phase
-
+
1. Receiver Encrypt :math:`r_i \in R`.
- - **Receiver's OPRF**: Receiver and Sender run ecdh-OPRF protocol, get
+ - **Receiver's OPRF**: Receiver and Sender run ecdh-OPRF protocol, get
:math:`\{\{H_2(r_i,{H_1(r_i)}^\beta)\}\}_{r_i\in R}`.
- **Receiver's CuckooHash**: Receiver performs cuckoo hashing on the set :math:`R` into CuckooTable C with m bins
using h1; h2; h3 has the hash functions.
@@ -276,9 +276,9 @@ SecretFlow SPU wraps `APSI `_ library, can be
- **Encrypt**: The receiver uses *FHE.Encrypt* to encrypt query powers and sends the ciphertexts to the sender.
2. **Sender Homomorphically evaluate Matching Polynomial**: The sender receives the collection of
- ciphertexts and homomorphically evaluates Matching Polynomial. If Labeled PSI is desired, Sender homomorphically evaluates
+ ciphertexts and homomorphically evaluates Matching Polynomial. If Labeled PSI is desired, Sender homomorphically evaluates
Label Polynomial. The sender sends evaluated ciphertexts to Receiver.
- 3. **Receiver Decrypt and Get result**: receiver receives and decrypts the matching ciphertexts, and labels
+ 3. **Receiver Decrypt and Get result**: receiver receives and decrypts the matching ciphertexts, and labels
ciphertexts if needed, outputs the matching set and labels.
Labeled PSI Parameters
@@ -325,15 +325,15 @@ RR22 Blazing Fast PSI
[RS21]_ introduced an efficient PSI protocol based on OKVS and VOLE. [RR22]_ present significant improvements
to the OKVS data structure along with new techniquesfor further reducing the communication overhead of [RS]21.
-Oblivous Key-Value Stores(OKVS) consists of algorithms Encode and Decode. Encode takes a list of key-value (k,v)
-pairs as input and returns an abstract data structure S. Decode takes such a data structure S and a key k' as
+Oblivious Key-Value Stores (OKVS) consists of algorithms Encode and Decode. Encode takes a list of key-value (k,v) 
+pairs as input and returns an abstract data structure S. Decode takes such a data structure S and a key k' as
input, and gives some output v'.
Pseudorandom correlation generators(PCGs) allow for the efficient generation of
oblivious transfer (OT) and vector oblivious linear evaluations (VOLE)
with sublinear communication and concretely good computational overhead.
-PCG makes use of a so-called LPN-friendly errorcorrecting code.
-`secretflow/YACL `_ provides VOLE code implementation.
+PCG makes use of a so-called LPN-friendly error-correcting code. 
+`secretflow/YACL `_ provides VOLE code implementation.
LPN-friendly coeds now support [CRR21]_ silver codes(LDPC) and [BCGI+22]_ Expand-Accumulate Codes.
Silver is Most efficient, but not recommended to use due to its security flaw.
@@ -341,7 +341,7 @@ Semi-honest Protocol:
.. figure:: ../_static/rr22_psi.png
-1. The Receiver samples :math:`r \leftarrow \{0,1\}^\kappa` and computes
+1. The Receiver samples :math:`r \leftarrow \{0,1\}^\kappa` and computes
:math:`\vec{P} := \mathrm{Encode} (L,r)` where
:math:`L := \{(H^{n*m}(x,r),H(x))|x \in X\}`.
@@ -349,11 +349,11 @@ Semi-honest Protocol:
:math:`\vec{B}`, Receiver gets :math:`\vec{A}` and :math:`\vec{C}`, such that:
:math:`\vec{C}=\Delta *\vec{A'}+\vec{B}`.
-3. Receiver sends :math:`r, \vec{A}=\vec{A'}+\vec{P}` to Sender. Sender defines
+3. Receiver sends :math:`r, \vec{A}=\vec{A'}+\vec{P}` to Sender. Sender defines
:math:`\vec{K}=\vec{B}+\Delta \cdot \vec{A}`.
-4. Sender sends :math:`Y'=H^{n*m}(\vec{Y},r)\cdot \vec{K}-\Delta \cdot H(\vec{Y})`
- to the Receiver.
+4. Sender sends :math:`Y'=H^{n*m}(\vec{Y},r)\cdot \vec{K}-\Delta \cdot H(\vec{Y})`
+ to the Receiver.
5. Receiver compares :math:`X'=H^{n*m}(\vec{X},r)\cdot \vec{C}` and :math:`Y'`, outputs
intersection result :math:`X \cap Y`.
@@ -368,14 +368,14 @@ Reference
OT extension and silent non-interactive secure computation. In ACM CCS 2019, pages 291–308.
ACM Press, November 2019.
-.. [BCG+19b] E. Boyle, G. Couteau, N. Gilboa, Y. Ishai, L. Kohl, P. Rindal, and P. Scholl.
+.. [BCG+19b] E. Boyle, G. Couteau, N. Gilboa, Y. Ishai, L. Kohl, P. Rindal, and P. Scholl.
Efficient two-round OT extension and silent non-interactive secure computation. In ACM CCS 2019,
pages 291–308. ACM Press, November 2019.
.. [Ber06] Daniel J. Bernstein. Curve25519: new diffie-hellman speed records. In In Public
Key Cryptography (PKC), Springer-Verlag LNCS 3958, page 2006, 2006. (Cited on page 4.)
-.. [BCGI+22] Elette Boyle, Geoffroy Couteau, Niv Gilboa, Yuval Ishai, Lisa Kohl, Nicolas Resch, Peter Scholl.
+.. [BCGI+22] Elette Boyle, Geoffroy Couteau, Niv Gilboa, Yuval Ishai, Lisa Kohl, Nicolas Resch, Peter Scholl.
Correlated Pseudorandomness from Expand-Accumulate Codes. Crypto2022.
.. [BBCD+11] Baldi, P., Baronio, R., Cristofaro, E.D., Gasti, P., Tsudik, G.: Countering GATTACA:
@@ -393,25 +393,25 @@ Reference
B.M., Evans, D., Malkin, T., Xu, D. (eds.) ACM CCS 2017. pp. 1243{1255. ACM Press (Oct / Nov 2017).
https://doi.org/10.1145/3133956.3134061
-.. [CMGD+21] Kelong Cong, Radames Cruz Moreno, Mariana Botelho da Gama, Wei Dai, Ilia Iliashenko, Kim Laine,
- Michael Rosenberg. Labeled PSI from Homomorphic Encryption with Reduced Computation and Communication
- CCS'21: Proceedings of the 2021 ACM SIGSAC Conference on Computer and Communications SecurityNovember 2021
+.. [CMGD+21] Kelong Cong, Radames Cruz Moreno, Mariana Botelho da Gama, Wei Dai, Ilia Iliashenko, Kim Laine,
+ Michael Rosenberg. Labeled PSI from Homomorphic Encryption with Reduced Computation and Communication
+ CCS'21: Proceedings of the 2021 ACM SIGSAC Conference on Computer and Communications SecurityNovember 2021
-.. [CRR21] Geoffroy Couteau, Peter Rindal, and Srinivasan Raghuraman. Silver: Silent VOLE and Oblivious Transfer
+.. [CRR21] Geoffroy Couteau, Peter Rindal, and Srinivasan Raghuraman. Silver: Silent VOLE and Oblivious Transfer
from Hardness of Decoding Structured LDPC Codes. Crypto2021.
.. [DP-PSI] Differentially-Private PSI https://arxiv.org/pdf/2208.13249.pdf
-.. [FourQ] Costello, C., Longa, P.: Fourq: four-dimensional decompositions on a q-curve over the mersenne prime.
+.. [FourQ] Costello, C., Longa, P.: Fourq: four-dimensional decompositions on a q-curve over the mersenne prime.
Cryptology ePrint Archive, Report 2015/565 (2015), https://eprint.iacr.org/2015/565
-.. [FV12] Fan, J., Vercauteren, F.: Somewhat practical fully homomorphic encryption. Cryptology ePrint Archive,
+.. [FV12] Fan, J., Vercauteren, F.: Somewhat practical fully homomorphic encryption. Cryptology ePrint Archive,
Report 2012/144 (2012), http://eprint.iacr.org/2012/144.pdf
.. [HFH99] Bernardo A. Huberman, Matt Franklin, and Tad Hogg. Enhancing privacy and trust in electronic
communities. In ACM CONFERENCE ON ELECTRONIC COMMERCE. ACM, 1999.
-.. [ipp-crypto] https://github.com/intel/ipp-crypto/
+.. [ipp-crypto] https://github.com/intel/ipp-crypto/
.. [JL10] Jarecki, S., Liu, X.: Fast Secure Computation of Set Intersection. In: SCN. LNCS,
vol. 6280, pp. 418–435. Springer (2010)
@@ -425,17 +425,17 @@ Reference
.. [PSZ18] B. Pinkas, T. Schneider, and M. Zohner. Scalable private set intersection based on ot extension.
ACM Transactions on Privacy and Security (TOPS), 21(2):1–35, 2018.
-.. [RA18] Resende, A.C.D., Aranha, D.F.: Faster unbalanced private set intersection. In: Meiklejohn, S.,
- Sako, K. (eds.) FC2018. LNCS, vol. 10957, pp. 203{221. Springer, Heidelberg (Feb / Mar 2018)
+.. [RA18] Resende, A.C.D., Aranha, D.F.: Faster unbalanced private set intersection. In: Meiklejohn, S.,
+ Sako, K. (eds.) FC2018. LNCS, vol. 10957, pp. 203{221. Springer, Heidelberg (Feb / Mar 2018)
.. [RR22] Srinivasan Raghuraman and Peter Rindal. Blazing Fast PSI from Improved OKVS and Subfield VOLE. CCS'22.
-.. [RRT23] Srinivasan Raghuraman, Peter Rindal, Titouan Tanguy. Expand-Convolute Codes for Pseudorandom
+.. [RRT23] Srinivasan Raghuraman, Peter Rindal, Titouan Tanguy. Expand-Convolute Codes for Pseudorandom
Correlation Generators from LPN. Crypto2023.
.. [RS21] Peter Rindal and Phillipp Schoppmann. VOLE-PSI: fast OPRF and circuit-psi from vector-ole. EUROCRYPT2021.
-.. [SEAL] Microsoft SEAL (release 4.0). https://github.com/Microsoft/SEAL (Sep 2022),
+.. [SEAL] Microsoft SEAL (release 4.0). https://github.com/Microsoft/SEAL (Sep 2022),
microsoft Research, Redmond, WA.
.. [SEC2-v2] Standards for Efficient Cryptography (SEC)
@@ -447,5 +447,5 @@ Reference
zero-knowledge proofs for boolean and arithmetic circuits. In 2021 IEEE Symposium on Security
and Privacy (SP), pages 1074–1091. IEEE, 2021.
-.. [draft-irtf-cfrg-voprf-10] Oblivious Pseudorandom Functions (OPRFs) using Prime-Order Groups.
- https://www.ietf.org/archive/id/draft-irtf-cfrg-voprf-10.html
+.. [draft-irtf-cfrg-voprf-10] Oblivious Pseudorandom Functions (OPRFs) using Prime-Order Groups.
+ https://www.ietf.org/archive/id/draft-irtf-cfrg-voprf-10.html
diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index 997f512..d59331a 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -20,7 +20,7 @@ Welcome to SecretFlow PSI Library. There are multiple methods to use PSI/PIR.
For PSI, we have a developing v2 PSI.
+------------------------+------------------------------------------------+---------------------------------------------+
-| | PSI v1 APIs | PSI v2 APIs |
+| | PSI v1 APIs | PSI v2 APIs |
+========================+================================================+=============================================+
| Supported Protocols | ECDH, KKRT, ECDH_OPRF_UB, DP_PSI, RR22 | ECDH, KKRT, RR22, ECDH_OPRF_UB |
+------------------------+------------------------------------------------+---------------------------------------------+
@@ -133,7 +133,7 @@ We use the same dev docker from secretflow/ubuntu-base-ci::
--entrypoint="bash" \
secretflow/ubuntu-base-ci:latest
-
+
# attach to build container
docker exec -it psi-dev-$(whoami) bash
@@ -152,7 +152,7 @@ You need to install:
* xxd
* lld
-For bazel, please check version in `.bazelversion `_ or use bazelisk instead.
+For bazel, please check version in `.bazeliskrc `_ or use bazelisk instead.
Build & UnitTest
^^^^^^^^^^^^^^^^
diff --git a/docs/reference/launch_config.md b/docs/reference/launch_config.md
index 565ae1b..5563fa5 100644
--- a/docs/reference/launch_config.md
+++ b/docs/reference/launch_config.md
@@ -9,7 +9,7 @@ Please check psi.v2.PsiConfig and psi.v2.UbPsiConfig at **PSI v2 Configuration**
- Messages
- [LaunchConfig](#launchconfig)
-
+
@@ -20,7 +20,7 @@ Please check psi.v2.PsiConfig and psi.v2.UbPsiConfig at **PSI v2 Configuration**
- [PartyProto](#partyproto)
- [RetryOptionsProto](#retryoptionsproto)
- [SSLOptionsProto](#ssloptionsproto)
-
+
@@ -163,3 +163,4 @@ SSL options.
|
bool | | bool | boolean | boolean |
| string | A string must always contain UTF-8 encoded or 7-bit ASCII text. | string | String | str/unicode |
| bytes | May contain any arbitrary sequence of bytes. | string | ByteString | str |
+
diff --git a/docs/reference/pir_config.md b/docs/reference/pir_config.md
index 68cf9fe..5dc2498 100644
--- a/docs/reference/pir_config.md
+++ b/docs/reference/pir_config.md
@@ -8,7 +8,7 @@
- [ApsiReceiverConfig](#apsireceiverconfig)
- [ApsiSenderConfig](#apsisenderconfig)
- [PirResultReport](#pirresultreport)
-
+
@@ -100,3 +100,4 @@ The report of pir task.
| bool | | bool | boolean | boolean |
| string | A string must always contain UTF-8 encoded or 7-bit ASCII text. | string | String | str/unicode |
| bytes | May contain any arbitrary sequence of bytes. | string | ByteString | str |
+
diff --git a/docs/reference/psi_config.md b/docs/reference/psi_config.md
index 8c6f7d1..dd7b8c4 100644
--- a/docs/reference/psi_config.md
+++ b/docs/reference/psi_config.md
@@ -11,13 +11,13 @@
- [MemoryPsiConfig](#memorypsiconfig)
- [OutputParams](#outputparams)
- [PsiResultReport](#psiresultreport)
-
+
- Enums
- [CurveType](#curvetype)
- [PsiType](#psitype)
-
+
- [Scalar Value Types](#scalar-value-types)
@@ -198,3 +198,4 @@ The algorithm type of psi.
| bool | | bool | boolean | boolean |
| string | A string must always contain UTF-8 encoded or 7-bit ASCII text. | string | String | str/unicode |
| bytes | May contain any arbitrary sequence of bytes. | string | ByteString | str |
+
diff --git a/docs/reference/psi_v2_config.md b/docs/reference/psi_v2_config.md
index ff21ffe..4fcf189 100644
--- a/docs/reference/psi_v2_config.md
+++ b/docs/reference/psi_v2_config.md
@@ -16,7 +16,7 @@
- [RecoveryConfig](#recoveryconfig)
- [Rr22Config](#rr22config)
- [UbPsiConfig](#ubpsiconfig)
-
+
- Enums
@@ -26,7 +26,7 @@
- [RecoveryCheckpoint.Stage](#recoverycheckpointstage)
- [Role](#role)
- [UbPsiConfig.Mode](#ubpsiconfigmode)
-
+
- [Scalar Value Types](#scalar-value-types)
@@ -466,3 +466,4 @@ Role of parties.
| bool | | bool | boolean | boolean |
| string | A string must always contain UTF-8 encoded or 7-bit ASCII text. | string | String | str/unicode |
| bytes | May contain any arbitrary sequence of bytes. | string | ByteString | str |
+
diff --git a/docs/user_guide/apsi_benchmark.md b/docs/user_guide/apsi_benchmark.md
index b62f0c7..e7ebca0 100644
--- a/docs/user_guide/apsi_benchmark.md
+++ b/docs/user_guide/apsi_benchmark.md
@@ -19,9 +19,9 @@ To measure the performance of APSI protocols under different data scales, we nee
```python
-# one million key-value pairs, each value's length is 32-byte,
+# one million key-value pairs, each value's length is 32-byte,
python examples/pir/apsi/test_data_creator.py --sender_size=1000000 --receiver_size=1 --intersection_size=1 --label_byte_count=32
-# 16 million key-value pairs, each value's length is 32-byte,
+# 16 million key-value pairs, each value's length is 32-byte,
python examples/pir/apsi/test_data_creator.py --sender_size=16000000 --receiver_size=1 --intersection_size=1 --label_byte_count=32
```
@@ -134,10 +134,10 @@ docker start apsi_sender
docker exec -it apsi_sender bash
```
-Then run:
+Then run:
```bash
-# offline
+# offline
./main --config $(pwd)/examples/pir/config/apsi_sender_setup.json
# online
./main --config $(pwd)/examples/pir/config/apsi_sender_online.json
@@ -216,3 +216,5 @@ If you wish to measure the APSI performance for a specific data scale and label
Note that the above data does not represent the optimal performance of APSI. Under fixed data scale conditions, the query performance of APSI is highly correlated with the corresponding parameters. Additionally, if you want to support larger datasets, such as one billion data entries, we also offer a bucket mode. However, this mode requires consideration of more parameters, so it is not displayed in this benchmark.
+
+
diff --git a/docs/user_guide/faq.md b/docs/user_guide/faq.md
index 0d479c4..bfaa6e9 100644
--- a/docs/user_guide/faq.md
+++ b/docs/user_guide/faq.md
@@ -88,3 +88,4 @@ If a PSI task fails, just restart the task with the same config, the progress wi
3. What is **Easy PSI**? Why and when to use **Easy PSI**?
[Easy PSI](https://www.secretflow.org.cn/zh-CN/docs/easy-psi) is a standalone PSI product powered by this library. It provides a simple User Interface and utilize [Kuscia](https://www.secretflow.org.cn/docs/kuscia) to launch PSI binaries between both parties.
+
diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst
index ceabfc7..af79ae4 100644
--- a/docs/user_guide/index.rst
+++ b/docs/user_guide/index.rst
@@ -12,3 +12,4 @@ PSI v2 is recommended to use. We are still working on PIR code refactoring.
faq
psi_v2_benchmark
apsi_benchmark
+
diff --git a/docs/user_guide/pir.rst b/docs/user_guide/pir.rst
index 695bc71..d67ee86 100644
--- a/docs/user_guide/pir.rst
+++ b/docs/user_guide/pir.rst
@@ -47,9 +47,9 @@ If you want to try a similar CLI like APSI, you could compile the source code by
.. code-block::
- bazel build psi/apsi_wrapper/cli:receiver
+ bazel build psi/wrapper/apsi/cli:receiver
- bazel build psi/apsi_wrapper/cli:sender
+ bazel build psi/wrapper/apsi/cli:sender
And run CLI like
@@ -57,9 +57,9 @@ And run CLI like
.. code-block::
- ./bazel-bin/psi/apsi_wrapper/cli/sender
+ ./bazel-bin/psi/wrapper/apsi/cli/sender
- ./bazel-bin/psi/apsi_wrapper/cli/receiver
+ ./bazel-bin/psi/wrapper/apsi/cli/receiver
Prepare data and config
@@ -113,7 +113,7 @@ PIR Config
""""""""""
1. Sender: Setup Stage. In this stage, sender generates sender db file with csv file. This stage is offline.
-Since version **0.4.0b0**, the source csv file for db generating should be specified as **source_file**, and **db_file**
+Since version **0.4.0b0**, the source csv file for db generating should be specified as **source_file**, and **db_file**
is used to specify db file.
.. code-block::
@@ -145,7 +145,7 @@ is used to specify db file.
{
"id": "receiver",
"host": "127.0.0.1:5400"
-
+
.. code-block::
:caption: apsi_sender_setup.json
diff --git a/docs/user_guide/psi.rst b/docs/user_guide/psi.rst
index c41cd3f..af9b6ce 100644
--- a/docs/user_guide/psi.rst
+++ b/docs/user_guide/psi.rst
@@ -7,9 +7,9 @@ Quick start with Private Set Intersection (PSI) V1 APIs.
Supported Protocols
----------------------
-The :psi_code_host:`ECDH-PSI ` is favorable if the bandwidth is the bottleneck.
+The :psi_code_host:`ECDH-PSI ` is favorable if the bandwidth is the bottleneck.
If the computing is the bottleneck, you should try the BaRK-OPRF based
-PSI :psi_code_host:`KKRT-PSI `.
+PSI :psi_code_host:`KKRT-PSI `.
+---------------+--------------+--------------+--------------+
| PSI protocols | Threat Model | Party Number | PsiTypeCode |
@@ -27,11 +27,11 @@ PSI :psi_code_host:`KKRT-PSI `.
| `DP-PSI`_ | Semi-Honest | 2P | - |
+---------------+--------------+--------------+--------------+
-MPC and PSI protocols are designed for specific Security model (or Threat Models).
+MPC and PSI protocols are designed for specific Security model (or Threat Models).
-Security model are widely considered to capture the capabilities of adversaries.
+Security model are widely considered to capture the capabilities of adversaries.
Adversaries of semi-honest model and malicious model are Semi-honest Adversary and
-Malicious Adversary.
+Malicious Adversary.
- `Semi-honest Adversary `_
- `Malicious Adversary `_
@@ -92,12 +92,12 @@ Then use binary with::
Benchmark
----------
-benchmark result without data load time
+benchmark result without data load time
ECDH PSI Benchmark
>>>>>>>>>>>>>>>>>>
-:psi_code_host:`DH-PSI benchmark code `
+:psi_code_host:`DH-PSI benchmark code `
cpu limited by docker(--cpu)
@@ -127,7 +127,7 @@ cpu limited by docker(--cpu)
KKRT PSI Benchmark
>>>>>>>>>>>>>>>>>>>
-All of our experiments use a single thread for each party.
+All of our experiments use a single thread for each party.
If the bandwidth is enough, the upstream could try to perform multi-threading optimizations
@@ -161,6 +161,6 @@ Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz
Security Tips
-------------
-Warning: `KKRT16 `_ is semi-honest PSI protocols,
+Warning: `KKRT16 `_ is semi-honest PSI protocols,
and may be attacked in malicious model.
We recommend using KKRT16 PSI protocol as one-way PSI, i.e., one party gets the final intersection result.
diff --git a/docs/user_guide/psi_v2.rst b/docs/user_guide/psi_v2.rst
index fd0987e..3b75ddd 100644
--- a/docs/user_guide/psi_v2.rst
+++ b/docs/user_guide/psi_v2.rst
@@ -117,7 +117,7 @@ To launch PSI, please check LaunchConfig at :doc:`/reference/launch_config` and
You need to prepare following files:
+------------------------+------------------------------------------------+-------------------------------------------------------------------------------+
-| File Name | Location | Description |
+| File Name | Location | Description |
+========================+================================================+===============================================================================+
| receiver.config | /tmp/receiver/receiver.config | Config for receiver. |
+------------------------+------------------------------------------------+-------------------------------------------------------------------------------+
diff --git a/psi/legacy/kmprt17_mp_psi/BUILD.bazel b/experiment/psi/kmprt17_mp_psi/BUILD.bazel
similarity index 97%
rename from psi/legacy/kmprt17_mp_psi/BUILD.bazel
rename to experiment/psi/kmprt17_mp_psi/BUILD.bazel
index c08ce10..be525bf 100644
--- a/psi/legacy/kmprt17_mp_psi/BUILD.bazel
+++ b/experiment/psi/kmprt17_mp_psi/BUILD.bazel
@@ -32,7 +32,7 @@ psi_cc_library(
"//psi/utils:communication",
"//psi/utils:sync",
"//psi/utils:test_utils",
- "@com_google_absl//absl/types:span",
+ "@abseil-cpp//absl/types:span",
"@yacl//yacl/base:exception",
"@yacl//yacl/base:int128",
"@yacl//yacl/crypto/hash:hash_utils",
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_hashing.cc b/experiment/psi/kmprt17_mp_psi/kmprt17_hashing.cc
similarity index 97%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_hashing.cc
rename to experiment/psi/kmprt17_mp_psi/kmprt17_hashing.cc
index ebd537d..b9c49aa 100644
--- a/psi/legacy/kmprt17_mp_psi/kmprt17_hashing.cc
+++ b/experiment/psi/kmprt17_mp_psi/kmprt17_hashing.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/legacy/kmprt17_mp_psi/kmprt17_hashing.h"
+#include "experiment/psi/kmprt17_mp_psi/kmprt17_hashing.h"
#include
#include
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_hashing.h b/experiment/psi/kmprt17_mp_psi/kmprt17_hashing.h
similarity index 100%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_hashing.h
rename to experiment/psi/kmprt17_mp_psi/kmprt17_hashing.h
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.cc b/experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.cc
similarity index 97%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.cc
rename to experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.cc
index 8048881..d8a38ec 100644
--- a/psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.cc
+++ b/experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.cc
@@ -12,15 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.h"
+#include "experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.h"
#include
+#include "experiment/psi/kmprt17_mp_psi/kmprt17_opprf.h"
#include "yacl/crypto/hash/hash_utils.h"
#include "yacl/crypto/rand/rand.h"
#include "yacl/utils/serialize.h"
-#include "psi/legacy/kmprt17_mp_psi/kmprt17_opprf.h"
#include "psi/utils/communication.h"
#include "psi/utils/sync.h"
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.h b/experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.h
similarity index 100%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.h
rename to experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.h
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi_test.cc b/experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi_test.cc
similarity index 98%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi_test.cc
rename to experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi_test.cc
index c8c865d..3bae534 100644
--- a/psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi_test.cc
+++ b/experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/legacy/kmprt17_mp_psi/kmprt17_mp_psi.h"
+#include "experiment/psi/kmprt17_mp_psi/kmprt17_mp_psi.h"
#include
#include
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_opprf.cc b/experiment/psi/kmprt17_mp_psi/kmprt17_opprf.cc
similarity index 98%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_opprf.cc
rename to experiment/psi/kmprt17_mp_psi/kmprt17_opprf.cc
index f5c0ae5..28257fa 100644
--- a/psi/legacy/kmprt17_mp_psi/kmprt17_opprf.cc
+++ b/experiment/psi/kmprt17_mp_psi/kmprt17_opprf.cc
@@ -12,12 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/legacy/kmprt17_mp_psi/kmprt17_opprf.h"
+#include "experiment/psi/kmprt17_mp_psi/kmprt17_opprf.h"
#include
#include
#include
+#include "experiment/psi/kmprt17_mp_psi/kmprt17_hashing.h"
#include "yacl/crypto/rand/rand.h"
#include "yacl/crypto/tools/ro.h"
#include "yacl/kernel/algorithms/base_ot.h"
@@ -26,8 +27,6 @@
#include "yacl/link/link.h"
#include "yacl/utils/serialize.h"
-#include "psi/legacy/kmprt17_mp_psi/kmprt17_hashing.h"
-
namespace psi::psi {
namespace yc = yacl::crypto;
diff --git a/psi/legacy/kmprt17_mp_psi/kmprt17_opprf.h b/experiment/psi/kmprt17_mp_psi/kmprt17_opprf.h
similarity index 100%
rename from psi/legacy/kmprt17_mp_psi/kmprt17_opprf.h
rename to experiment/psi/kmprt17_mp_psi/kmprt17_opprf.h
diff --git a/psi/BUILD.bazel b/psi/BUILD.bazel
index c2914bc..8699466 100644
--- a/psi/BUILD.bazel
+++ b/psi/BUILD.bazel
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-load("//bazel:psi.bzl", "psi_cc_binary", "psi_cc_library", "psi_cc_test")
+load("//bazel:psi.bzl", "psi_cc_library")
package(default_visibility = ["//visibility:public"])
@@ -41,59 +41,18 @@ psi_cc_library(
"//psi/utils:recovery",
"//psi/utils:resource_manager",
"//psi/utils:table_utils",
- "@com_github_google_perfetto//:perfetto",
- "@com_google_absl//absl/status",
+ "@abseil-cpp//absl/status",
+ "@perfetto",
"@yacl//yacl/link",
],
)
-psi_cc_library(
- name = "factory",
- srcs = ["factory.cc"],
- hdrs = ["factory.h"],
- deps = [
- "//psi/ecdh:receiver",
- "//psi/ecdh:sender",
- "//psi/ecdh/ub_psi:client",
- "//psi/ecdh/ub_psi:server",
- "//psi/kkrt:receiver",
- "//psi/kkrt:sender",
- "//psi/rr22:receiver",
- "//psi/rr22:sender",
- "@yacl//yacl/base:exception",
- ],
-)
-
-psi_cc_library(
- name = "launch",
- srcs = ["launch.cc"],
- hdrs = ["launch.h"],
- deps = [
- ":factory",
- ":trace_categories",
- "//psi/apsi_wrapper/cli:entry",
- "//psi/legacy:bucket_psi",
- "@boost//:algorithm",
- ],
-)
-
psi_cc_library(
name = "trace_categories",
srcs = ["trace_categories.cc"],
hdrs = ["trace_categories.h"],
deps = [
- "@com_github_google_perfetto//:perfetto",
- ],
-)
-
-psi_cc_test(
- name = "psi_test",
- srcs = ["psi_test.cc"],
- flaky = True,
- deps = [
- ":factory",
- "//psi/utils:arrow_csv_batch_provider",
- "@yacl//yacl/utils:scope_guard",
+ "@perfetto",
],
)
@@ -101,40 +60,3 @@ psi_cc_library(
name = "version",
hdrs = ["version.h"],
)
-
-psi_cc_library(
- name = "kuscia_adapter",
- srcs = [
- "kuscia_adapter.cc",
- ],
- hdrs = [
- "kuscia_adapter.h",
- ],
- deps = [
- "//psi/proto:entry_cc_proto",
- "//psi/proto:kuscia_cc_proto",
- "@com_github_tencent_rapidjson//:rapidjson",
- "@yacl//yacl/base:exception",
- ],
-)
-
-psi_cc_test(
- name = "kuscia_adapter_test",
- srcs = ["kuscia_adapter_test.cc"],
- deps = [
- ":kuscia_adapter",
- ],
-)
-
-psi_cc_binary(
- name = "main",
- srcs = ["main.cc"],
- deps = [
- ":kuscia_adapter",
- ":version",
- "//psi:launch",
- "//psi/proto:entry_cc_proto",
- "//psi/utils:resource_manager",
- "@com_github_gflags_gflags//:gflags",
- ],
-)
diff --git a/psi/ecdh/BUILD.bazel b/psi/algorithm/ecdh/BUILD.bazel
similarity index 98%
rename from psi/ecdh/BUILD.bazel
rename to psi/algorithm/ecdh/BUILD.bazel
index 7b91ff3..7cd1a78 100644
--- a/psi/ecdh/BUILD.bazel
+++ b/psi/algorithm/ecdh/BUILD.bazel
@@ -27,7 +27,7 @@ psi_cc_library(
"//psi/utils:communication",
"//psi/utils:ec_point_store",
"//psi/utils:recovery",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/link",
"@yacl//yacl/utils:parallel",
],
diff --git a/psi/ecdh/common.h b/psi/algorithm/ecdh/common.h
similarity index 100%
rename from psi/ecdh/common.h
rename to psi/algorithm/ecdh/common.h
diff --git a/psi/ecdh/ecdh_3pc_psi.cc b/psi/algorithm/ecdh/ecdh_3pc_psi.cc
similarity index 99%
rename from psi/ecdh/ecdh_3pc_psi.cc
rename to psi/algorithm/ecdh/ecdh_3pc_psi.cc
index 50a6158..fb11d7f 100644
--- a/psi/ecdh/ecdh_3pc_psi.cc
+++ b/psi/algorithm/ecdh/ecdh_3pc_psi.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ecdh_3pc_psi.h"
+#include "psi/algorithm/ecdh/ecdh_3pc_psi.h"
#include
#include
@@ -376,4 +376,4 @@ size_t ShuffleEcdh3PcPsi::GetPartnersPsiPeerRank() {
}
}
-} // namespace psi::ecdh
\ No newline at end of file
+} // namespace psi::ecdh
diff --git a/psi/ecdh/ecdh_3pc_psi.h b/psi/algorithm/ecdh/ecdh_3pc_psi.h
similarity index 99%
rename from psi/ecdh/ecdh_3pc_psi.h
rename to psi/algorithm/ecdh/ecdh_3pc_psi.h
index 29899a7..abe8793 100644
--- a/psi/ecdh/ecdh_3pc_psi.h
+++ b/psi/algorithm/ecdh/ecdh_3pc_psi.h
@@ -18,7 +18,7 @@
#include
#include
-#include "psi/ecdh/ecdh_psi.h"
+#include "psi/algorithm/ecdh/ecdh_psi.h"
#include "psi/utils/communication.h"
namespace psi::ecdh {
diff --git a/psi/ecdh/ecdh_3pc_psi_benchmark.cc b/psi/algorithm/ecdh/ecdh_3pc_psi_benchmark.cc
similarity index 98%
rename from psi/ecdh/ecdh_3pc_psi_benchmark.cc
rename to psi/algorithm/ecdh/ecdh_3pc_psi_benchmark.cc
index e66de39..ebed22a 100644
--- a/psi/ecdh/ecdh_3pc_psi_benchmark.cc
+++ b/psi/algorithm/ecdh/ecdh_3pc_psi_benchmark.cc
@@ -19,7 +19,7 @@
#include "yacl/base/exception.h"
#include "yacl/link/test_util.h"
-#include "psi/ecdh/ecdh_3pc_psi.h"
+#include "psi/algorithm/ecdh/ecdh_3pc_psi.h"
#include "psi/utils/test_utils.h"
static void BM_Ecdh3PcPsi(benchmark::State& state) {
diff --git a/psi/ecdh/ecdh_3pc_psi_test.cc b/psi/algorithm/ecdh/ecdh_3pc_psi_test.cc
similarity index 99%
rename from psi/ecdh/ecdh_3pc_psi_test.cc
rename to psi/algorithm/ecdh/ecdh_3pc_psi_test.cc
index d5001c6..0248c86 100644
--- a/psi/ecdh/ecdh_3pc_psi_test.cc
+++ b/psi/algorithm/ecdh/ecdh_3pc_psi_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ecdh_3pc_psi.h"
+#include "psi/algorithm/ecdh/ecdh_3pc_psi.h"
#include
#include
diff --git a/psi/ecdh/ecdh_logger.h b/psi/algorithm/ecdh/ecdh_logger.h
similarity index 100%
rename from psi/ecdh/ecdh_logger.h
rename to psi/algorithm/ecdh/ecdh_logger.h
diff --git a/psi/ecdh/ecdh_psi.cc b/psi/algorithm/ecdh/ecdh_psi.cc
similarity index 99%
rename from psi/ecdh/ecdh_psi.cc
rename to psi/algorithm/ecdh/ecdh_psi.cc
index 074260a..20d7278 100644
--- a/psi/ecdh/ecdh_psi.cc
+++ b/psi/algorithm/ecdh/ecdh_psi.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ecdh_psi.h"
+#include "psi/algorithm/ecdh/ecdh_psi.h"
#include
#include
diff --git a/psi/ecdh/ecdh_psi.h b/psi/algorithm/ecdh/ecdh_psi.h
similarity index 99%
rename from psi/ecdh/ecdh_psi.h
rename to psi/algorithm/ecdh/ecdh_psi.h
index 1988b2c..942500e 100644
--- a/psi/ecdh/ecdh_psi.h
+++ b/psi/algorithm/ecdh/ecdh_psi.h
@@ -24,8 +24,8 @@
#include "yacl/link/link.h"
+#include "psi/algorithm/ecdh/ecdh_logger.h"
#include "psi/cryptor/ecc_cryptor.h"
-#include "psi/ecdh/ecdh_logger.h"
#include "psi/utils/batch_provider.h"
#include "psi/utils/communication.h"
#include "psi/utils/ec_point_store.h"
diff --git a/psi/ecdh/ecdh_psi_benchmark.cc b/psi/algorithm/ecdh/ecdh_psi_benchmark.cc
similarity index 98%
rename from psi/ecdh/ecdh_psi_benchmark.cc
rename to psi/algorithm/ecdh/ecdh_psi_benchmark.cc
index de981f5..4a2e8ed 100644
--- a/psi/ecdh/ecdh_psi_benchmark.cc
+++ b/psi/algorithm/ecdh/ecdh_psi_benchmark.cc
@@ -20,8 +20,8 @@
#include "yacl/base/exception.h"
#include "yacl/link/test_util.h"
+#include "psi/algorithm/ecdh/ecdh_psi.h"
#include "psi/cryptor/cryptor_selector.h"
-#include "psi/ecdh/ecdh_psi.h"
#include "psi/utils/batch_provider.h"
#include "psi/utils/ec_point_store.h"
diff --git a/psi/ecdh/ecdh_psi_test.cc b/psi/algorithm/ecdh/ecdh_psi_test.cc
similarity index 99%
rename from psi/ecdh/ecdh_psi_test.cc
rename to psi/algorithm/ecdh/ecdh_psi_test.cc
index 271e4c9..c69d751 100644
--- a/psi/ecdh/ecdh_psi_test.cc
+++ b/psi/algorithm/ecdh/ecdh_psi_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ecdh_psi.h"
+#include "psi/algorithm/ecdh/ecdh_psi.h"
#include
#include
diff --git a/psi/ecdh/receiver.cc b/psi/algorithm/ecdh/receiver.cc
similarity index 98%
rename from psi/ecdh/receiver.cc
rename to psi/algorithm/ecdh/receiver.cc
index 9592339..dbe9e71 100644
--- a/psi/ecdh/receiver.cc
+++ b/psi/algorithm/ecdh/receiver.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/receiver.h"
+#include "psi/algorithm/ecdh/receiver.h"
#include
@@ -21,8 +21,8 @@
#include "yacl/base/exception.h"
#include "yacl/utils/scope_guard.h"
+#include "psi/algorithm/ecdh/common.h"
#include "psi/cryptor/cryptor_selector.h"
-#include "psi/ecdh/common.h"
#include "psi/trace_categories.h"
#include "psi/utils/sync.h"
diff --git a/psi/ecdh/receiver.h b/psi/algorithm/ecdh/receiver.h
similarity index 96%
rename from psi/ecdh/receiver.h
rename to psi/algorithm/ecdh/receiver.h
index cdd72ef..938498f 100644
--- a/psi/ecdh/receiver.h
+++ b/psi/algorithm/ecdh/receiver.h
@@ -13,7 +13,7 @@
// limitations under the License.
#pragma once
-#include "psi/ecdh/ecdh_psi.h"
+#include "psi/algorithm/ecdh/ecdh_psi.h"
#include "psi/interface.h"
#include "psi/utils/arrow_csv_batch_provider.h"
diff --git a/psi/ecdh/sender.cc b/psi/algorithm/ecdh/sender.cc
similarity index 98%
rename from psi/ecdh/sender.cc
rename to psi/algorithm/ecdh/sender.cc
index 1fb62a2..cae3519 100644
--- a/psi/ecdh/sender.cc
+++ b/psi/algorithm/ecdh/sender.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/sender.h"
+#include "psi/algorithm/ecdh/sender.h"
#include
@@ -22,8 +22,8 @@
#include "yacl/base/exception.h"
#include "yacl/utils/scope_guard.h"
+#include "psi/algorithm/ecdh/common.h"
#include "psi/cryptor/cryptor_selector.h"
-#include "psi/ecdh/common.h"
#include "psi/trace_categories.h"
#include "psi/utils/sync.h"
diff --git a/psi/ecdh/sender.h b/psi/algorithm/ecdh/sender.h
similarity index 96%
rename from psi/ecdh/sender.h
rename to psi/algorithm/ecdh/sender.h
index e439d78..7050ede 100644
--- a/psi/ecdh/sender.h
+++ b/psi/algorithm/ecdh/sender.h
@@ -13,7 +13,7 @@
// limitations under the License.
#pragma once
-#include "psi/ecdh/ecdh_psi.h"
+#include "psi/algorithm/ecdh/ecdh_psi.h"
#include "psi/interface.h"
#include "psi/utils/arrow_csv_batch_provider.h"
diff --git a/psi/ecdh/ub_psi/BUILD.bazel b/psi/algorithm/ecdh/ub_psi/BUILD.bazel
similarity index 90%
rename from psi/ecdh/ub_psi/BUILD.bazel
rename to psi/algorithm/ecdh/ub_psi/BUILD.bazel
index e7aba0d..d12f65a 100644
--- a/psi/ecdh/ub_psi/BUILD.bazel
+++ b/psi/algorithm/ecdh/ub_psi/BUILD.bazel
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-load("//bazel:psi.bzl", "psi_cc_binary", "psi_cc_library", "psi_cc_test")
+load("//bazel:psi.bzl", "psi_cc_library", "psi_cc_test")
package(default_visibility = ["//visibility:public"])
@@ -24,8 +24,8 @@ psi_cc_library(
linkopts = ["-ldl"],
deps = [
"//psi/cryptor:ecc_cryptor",
- "@com_github_openssl_openssl//:openssl",
- "@com_google_absl//absl/types:span",
+ "@abseil-cpp//absl/types:span",
+ "@openssl",
"@yacl//yacl/base:byte_container_view",
"@yacl//yacl/base:exception",
"@yacl//yacl/utils:parallel",
@@ -54,8 +54,8 @@ psi_cc_library(
":ecdh_oprf",
"//psi/cryptor:ecc_utils",
"//psi/cryptor:sm2_cryptor",
- "@com_github_microsoft_FourQlib//:FourQlib",
- "@com_google_absl//absl/types:span",
+ "@abseil-cpp//absl/types:span",
+ "@fourqlib//:FourQlib",
"@yacl//yacl/base:exception",
"@yacl//yacl/crypto/hash:blake3",
"@yacl//yacl/crypto/hash:hash_utils",
@@ -83,7 +83,7 @@ psi_cc_library(
"//psi/utils:communication",
"//psi/utils:ec_point_store",
"//psi/utils:ub_psi_cache",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/base:exception",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/link",
@@ -98,7 +98,7 @@ psi_cc_test(
":ecdh_oprf_psi",
"//psi/utils:batch_provider_impl",
"//psi/utils:test_utils",
- "@com_google_absl//absl/time",
+ "@abseil-cpp//absl/time",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/crypto/tools:prg",
"@yacl//yacl/utils:scope_guard",
diff --git a/psi/ecdh/ub_psi/basic_ecdh_oprf.cc b/psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.cc
similarity index 99%
rename from psi/ecdh/ub_psi/basic_ecdh_oprf.cc
rename to psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.cc
index 2612c84..eb52041 100644
--- a/psi/ecdh/ub_psi/basic_ecdh_oprf.cc
+++ b/psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/basic_ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.h"
#include
#include
diff --git a/psi/ecdh/ub_psi/basic_ecdh_oprf.h b/psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.h
similarity index 99%
rename from psi/ecdh/ub_psi/basic_ecdh_oprf.h
rename to psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.h
index 62a2cff..6655080 100644
--- a/psi/ecdh/ub_psi/basic_ecdh_oprf.h
+++ b/psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.h
@@ -26,9 +26,9 @@
#include "yacl/base/exception.h"
#include "yacl/crypto/hash/hash_interface.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf.h"
#include "psi/cryptor/ecc_cryptor.h"
#include "psi/cryptor/sm2_cryptor.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf.h"
// 2HashDH Oprf
// F_k(x) = H2(x, H1(x)^k)
diff --git a/psi/ecdh/ub_psi/basic_ecdh_oprf_test.cc b/psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf_test.cc
similarity index 96%
rename from psi/ecdh/ub_psi/basic_ecdh_oprf_test.cc
rename to psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf_test.cc
index 6152bcd..2c5b542 100644
--- a/psi/ecdh/ub_psi/basic_ecdh_oprf_test.cc
+++ b/psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/basic_ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.h"
#include
#include
@@ -28,7 +28,7 @@
#include "yacl/crypto/rand/rand.h"
#include "yacl/crypto/tools/prg.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf_selector.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h"
namespace psi::ecdh {
struct TestParams {
diff --git a/psi/ecdh/ub_psi/client.cc b/psi/algorithm/ecdh/ub_psi/client.cc
similarity index 99%
rename from psi/ecdh/ub_psi/client.cc
rename to psi/algorithm/ecdh/ub_psi/client.cc
index d45aa83..3f170f4 100644
--- a/psi/ecdh/ub_psi/client.cc
+++ b/psi/algorithm/ecdh/ub_psi/client.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/client.h"
+#include "psi/algorithm/ecdh/ub_psi/client.h"
#include
#include
diff --git a/psi/ecdh/ub_psi/client.h b/psi/algorithm/ecdh/ub_psi/client.h
similarity index 96%
rename from psi/ecdh/ub_psi/client.h
rename to psi/algorithm/ecdh/ub_psi/client.h
index f10cc34..6001bc1 100644
--- a/psi/ecdh/ub_psi/client.h
+++ b/psi/algorithm/ecdh/ub_psi/client.h
@@ -13,7 +13,7 @@
// limitations under the License.
#pragma once
-#include "psi/ecdh/ub_psi/ecdh_oprf_psi.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h"
#include "psi/interface.h"
#include "psi/utils/resource_manager.h"
diff --git a/psi/ecdh/ub_psi/ecdh_oprf.cc b/psi/algorithm/ecdh/ub_psi/ecdh_oprf.cc
similarity index 98%
rename from psi/ecdh/ub_psi/ecdh_oprf.cc
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf.cc
index 4f19b73..b218987 100644
--- a/psi/ecdh/ub_psi/ecdh_oprf.cc
+++ b/psi/algorithm/ecdh/ub_psi/ecdh_oprf.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf.h"
#include
#include
diff --git a/psi/ecdh/ub_psi/ecdh_oprf.h b/psi/algorithm/ecdh/ub_psi/ecdh_oprf.h
similarity index 100%
rename from psi/ecdh/ub_psi/ecdh_oprf.h
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf.h
diff --git a/psi/ecdh/ub_psi/ecdh_oprf_psi.cc b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.cc
similarity index 99%
rename from psi/ecdh/ub_psi/ecdh_oprf_psi.cc
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.cc
index 5c67cfb..b78723b 100644
--- a/psi/ecdh/ub_psi/ecdh_oprf_psi.cc
+++ b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/ecdh_oprf_psi.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h"
#include
#include
@@ -34,8 +34,8 @@
#include "yacl/crypto/rand/rand.h"
#include "yacl/utils/parallel.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h"
#include "psi/cryptor/ecc_utils.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf_selector.h"
#include "psi/utils/communication.h"
#include "psi/utils/serialize.h"
diff --git a/psi/ecdh/ub_psi/ecdh_oprf_psi.h b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h
similarity index 98%
rename from psi/ecdh/ub_psi/ecdh_oprf_psi.h
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h
index 404a534..e6e0aa4 100644
--- a/psi/ecdh/ub_psi/ecdh_oprf_psi.h
+++ b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h
@@ -26,8 +26,8 @@
#include "yacl/base/byte_container_view.h"
#include "yacl/link/link.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf_selector.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h"
#include "psi/utils/batch_provider.h"
#include "psi/utils/ec_point_store.h"
#include "psi/utils/ub_psi_cache.h"
diff --git a/psi/ecdh/ub_psi/ecdh_oprf_psi_test.cc b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi_test.cc
similarity index 99%
rename from psi/ecdh/ub_psi/ecdh_oprf_psi_test.cc
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi_test.cc
index e7c93fe..aaa016e 100644
--- a/psi/ecdh/ub_psi/ecdh_oprf_psi_test.cc
+++ b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/ecdh_oprf_psi.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h"
#include
#include
@@ -31,7 +31,7 @@
#include "yacl/link/test_util.h"
#include "yacl/utils/scope_guard.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf_selector.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h"
#include "psi/utils/arrow_csv_batch_provider.h"
#include "psi/utils/batch_provider_impl.h"
#include "psi/utils/ec_point_store.h"
diff --git a/psi/ecdh/ub_psi/ecdh_oprf_selector.cc b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.cc
similarity index 96%
rename from psi/ecdh/ub_psi/ecdh_oprf_selector.cc
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.cc
index 2d523cf..af1cfcb 100644
--- a/psi/ecdh/ub_psi/ecdh_oprf_selector.cc
+++ b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.cc
@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/ecdh_oprf_selector.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h"
#include "yacl/utils/platform_utils.h"
-#include "psi/ecdh/ub_psi/basic_ecdh_oprf.h"
-#include "psi/ecdh/ub_psi/ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/basic_ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf.h"
namespace psi::ecdh {
diff --git a/psi/ecdh/ub_psi/ecdh_oprf_selector.h b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h
similarity index 96%
rename from psi/ecdh/ub_psi/ecdh_oprf_selector.h
rename to psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h
index a48be4a..776addb 100644
--- a/psi/ecdh/ub_psi/ecdh_oprf_selector.h
+++ b/psi/algorithm/ecdh/ub_psi/ecdh_oprf_selector.h
@@ -16,7 +16,7 @@
#include
-#include "psi/ecdh/ub_psi/ecdh_oprf.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf.h"
namespace psi::ecdh {
diff --git a/psi/ecdh/ub_psi/server.cc b/psi/algorithm/ecdh/ub_psi/server.cc
similarity index 99%
rename from psi/ecdh/ub_psi/server.cc
rename to psi/algorithm/ecdh/ub_psi/server.cc
index 0ea7aa1..9b65665 100644
--- a/psi/ecdh/ub_psi/server.cc
+++ b/psi/algorithm/ecdh/ub_psi/server.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/ecdh/ub_psi/server.h"
+#include "psi/algorithm/ecdh/ub_psi/server.h"
#include
diff --git a/psi/ecdh/ub_psi/server.h b/psi/algorithm/ecdh/ub_psi/server.h
similarity index 97%
rename from psi/ecdh/ub_psi/server.h
rename to psi/algorithm/ecdh/ub_psi/server.h
index 7a08c33..05114d9 100644
--- a/psi/ecdh/ub_psi/server.h
+++ b/psi/algorithm/ecdh/ub_psi/server.h
@@ -13,7 +13,7 @@
// limitations under the License.
#pragma once
-#include "psi/ecdh/ub_psi/ecdh_oprf_psi.h"
+#include "psi/algorithm/ecdh/ub_psi/ecdh_oprf_psi.h"
#include "psi/interface.h"
#include "psi/utils/arrow_csv_batch_provider.h"
#include "psi/utils/join_processor.h"
diff --git a/psi/kkrt/BUILD.bazel b/psi/algorithm/kkrt/BUILD.bazel
similarity index 98%
rename from psi/kkrt/BUILD.bazel
rename to psi/algorithm/kkrt/BUILD.bazel
index 0e5784c..deeecaa 100644
--- a/psi/kkrt/BUILD.bazel
+++ b/psi/algorithm/kkrt/BUILD.bazel
@@ -25,7 +25,7 @@ psi_cc_library(
"//psi/utils:communication",
"//psi/utils:cuckoo_index",
"//psi/utils:serialize",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/crypto/hash:hash_utils",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/kernel/algorithms:base_ot",
diff --git a/psi/kkrt/common.cc b/psi/algorithm/kkrt/common.cc
similarity index 94%
rename from psi/kkrt/common.cc
rename to psi/algorithm/kkrt/common.cc
index 6eaa87f..5dcf0d4 100644
--- a/psi/kkrt/common.cc
+++ b/psi/algorithm/kkrt/common.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kkrt/common.h"
+#include "psi/algorithm/kkrt/common.h"
#include "psi/utils/bucket.h"
diff --git a/psi/kkrt/common.h b/psi/algorithm/kkrt/common.h
similarity index 100%
rename from psi/kkrt/common.h
rename to psi/algorithm/kkrt/common.h
diff --git a/psi/kkrt/kkrt_psi.cc b/psi/algorithm/kkrt/kkrt_psi.cc
similarity index 99%
rename from psi/kkrt/kkrt_psi.cc
rename to psi/algorithm/kkrt/kkrt_psi.cc
index 1f47a69..4c9376c 100644
--- a/psi/kkrt/kkrt_psi.cc
+++ b/psi/algorithm/kkrt/kkrt_psi.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kkrt/kkrt_psi.h"
+#include "psi/algorithm/kkrt/kkrt_psi.h"
#include
#include
diff --git a/psi/kkrt/kkrt_psi.h b/psi/algorithm/kkrt/kkrt_psi.h
similarity index 100%
rename from psi/kkrt/kkrt_psi.h
rename to psi/algorithm/kkrt/kkrt_psi.h
diff --git a/psi/kkrt/kkrt_psi_benchmark.cc b/psi/algorithm/kkrt/kkrt_psi_benchmark.cc
similarity index 98%
rename from psi/kkrt/kkrt_psi_benchmark.cc
rename to psi/algorithm/kkrt/kkrt_psi_benchmark.cc
index 60e2f24..0eedb74 100644
--- a/psi/kkrt/kkrt_psi_benchmark.cc
+++ b/psi/algorithm/kkrt/kkrt_psi_benchmark.cc
@@ -20,7 +20,7 @@
#include "yacl/crypto/hash/hash_utils.h"
#include "yacl/link/test_util.h"
-#include "psi/kkrt/kkrt_psi.h"
+#include "psi/algorithm/kkrt/kkrt_psi.h"
namespace {
std::vector CreateRangeItems(size_t begin, size_t size) {
diff --git a/psi/kkrt/kkrt_psi_test.cc b/psi/algorithm/kkrt/kkrt_psi_test.cc
similarity index 99%
rename from psi/kkrt/kkrt_psi_test.cc
rename to psi/algorithm/kkrt/kkrt_psi_test.cc
index 20327e8..dc49f74 100644
--- a/psi/kkrt/kkrt_psi_test.cc
+++ b/psi/algorithm/kkrt/kkrt_psi_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kkrt/kkrt_psi.h"
+#include "psi/algorithm/kkrt/kkrt_psi.h"
#include
#include
diff --git a/psi/kkrt/receiver.cc b/psi/algorithm/kkrt/receiver.cc
similarity index 97%
rename from psi/kkrt/receiver.cc
rename to psi/algorithm/kkrt/receiver.cc
index aa35457..e91aa3d 100644
--- a/psi/kkrt/receiver.cc
+++ b/psi/algorithm/kkrt/receiver.cc
@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kkrt/receiver.h"
+#include "psi/algorithm/kkrt/receiver.h"
#include "yacl/crypto/hash/hash_utils.h"
#include "yacl/utils/parallel.h"
-#include "psi/kkrt/common.h"
-#include "psi/kkrt/kkrt_psi.h"
+#include "psi/algorithm/kkrt/common.h"
+#include "psi/algorithm/kkrt/kkrt_psi.h"
#include "psi/legacy/bucket_psi.h"
#include "psi/prelude.h"
#include "psi/trace_categories.h"
diff --git a/psi/kkrt/receiver.h b/psi/algorithm/kkrt/receiver.h
similarity index 100%
rename from psi/kkrt/receiver.h
rename to psi/algorithm/kkrt/receiver.h
diff --git a/psi/kkrt/sender.cc b/psi/algorithm/kkrt/sender.cc
similarity index 97%
rename from psi/kkrt/sender.cc
rename to psi/algorithm/kkrt/sender.cc
index 6e70bee..179bb25 100644
--- a/psi/kkrt/sender.cc
+++ b/psi/algorithm/kkrt/sender.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kkrt/sender.h"
+#include "psi/algorithm/kkrt/sender.h"
#include
#include
@@ -21,8 +21,8 @@
#include "yacl/crypto/hash/hash_utils.h"
#include "yacl/utils/parallel.h"
-#include "psi/kkrt/common.h"
-#include "psi/kkrt/kkrt_psi.h"
+#include "psi/algorithm/kkrt/common.h"
+#include "psi/algorithm/kkrt/kkrt_psi.h"
#include "psi/legacy/bucket_psi.h"
#include "psi/prelude.h"
#include "psi/trace_categories.h"
diff --git a/psi/kkrt/sender.h b/psi/algorithm/kkrt/sender.h
similarity index 100%
rename from psi/kkrt/sender.h
rename to psi/algorithm/kkrt/sender.h
diff --git a/psi/kwpir/BUILD.bazel b/psi/algorithm/kwpir/BUILD.bazel
similarity index 96%
rename from psi/kwpir/BUILD.bazel
rename to psi/algorithm/kwpir/BUILD.bazel
index 6225898..83cb454 100644
--- a/psi/kwpir/BUILD.bazel
+++ b/psi/algorithm/kwpir/BUILD.bazel
@@ -41,6 +41,6 @@ psi_cc_test(
srcs = ["kw_pir_test.cc"],
deps = [
":kw_pir",
- "//psi/sealpir:seal_pir",
+ "//psi/algorithm/sealpir:seal_pir",
],
)
diff --git a/psi/kwpir/index_pir.h b/psi/algorithm/kwpir/index_pir.h
similarity index 100%
rename from psi/kwpir/index_pir.h
rename to psi/algorithm/kwpir/index_pir.h
diff --git a/psi/kwpir/kw_pir.cc b/psi/algorithm/kwpir/kw_pir.cc
similarity index 98%
rename from psi/kwpir/kw_pir.cc
rename to psi/algorithm/kwpir/kw_pir.cc
index 10ec68c..f63cd98 100644
--- a/psi/kwpir/kw_pir.cc
+++ b/psi/algorithm/kwpir/kw_pir.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kwpir/kw_pir.h"
+#include "psi/algorithm/kwpir/kw_pir.h"
#include "yacl/crypto/hash/hash_utils.h"
@@ -131,4 +131,4 @@ std::vector> KwPirClient::DecodeReply(
}
return ele_vec;
}
-} // namespace psi::kwpir
\ No newline at end of file
+} // namespace psi::kwpir
diff --git a/psi/kwpir/kw_pir.h b/psi/algorithm/kwpir/kw_pir.h
similarity index 97%
rename from psi/kwpir/kw_pir.h
rename to psi/algorithm/kwpir/kw_pir.h
index 7e6817e..84d0777 100644
--- a/psi/kwpir/kw_pir.h
+++ b/psi/algorithm/kwpir/kw_pir.h
@@ -24,7 +24,7 @@
#include "yacl/base/byte_container_view.h"
#include "yacl/crypto/rand/rand.h"
-#include "psi/kwpir/index_pir.h"
+#include "psi/algorithm/kwpir/index_pir.h"
#include "psi/utils/cuckoo_index.h"
namespace psi::kwpir {
@@ -88,4 +88,4 @@ class KwPirClient : public KwPir {
private:
std::unique_ptr pir_client_;
};
-} // namespace psi::kwpir
\ No newline at end of file
+} // namespace psi::kwpir
diff --git a/psi/kwpir/kw_pir_test.cc b/psi/algorithm/kwpir/kw_pir_test.cc
similarity index 98%
rename from psi/kwpir/kw_pir_test.cc
rename to psi/algorithm/kwpir/kw_pir_test.cc
index b0be22d..07f22fc 100644
--- a/psi/kwpir/kw_pir_test.cc
+++ b/psi/algorithm/kwpir/kw_pir_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/kwpir/kw_pir.h"
+#include "psi/algorithm/kwpir/kw_pir.h"
#include
#include
@@ -21,7 +21,7 @@
#include "spdlog/spdlog.h"
#include "yacl/crypto/hash/hash_utils.h"
-#include "psi/sealpir/seal_pir.h"
+#include "psi/algorithm/sealpir/seal_pir.h"
namespace psi::kwpir {
@@ -152,4 +152,4 @@ INSTANTIATE_TEST_SUITE_P(
TestParams{(1 << 22) - (1 << 10), 3, 1.3, 16, 64, 128, 4096,
2, 0}));
-} // namespace psi::kwpir
\ No newline at end of file
+} // namespace psi::kwpir
diff --git a/psi/rr22/BUILD.bazel b/psi/algorithm/rr22/BUILD.bazel
similarity index 93%
rename from psi/rr22/BUILD.bazel
rename to psi/algorithm/rr22/BUILD.bazel
index 0afc765..82be1c4 100644
--- a/psi/rr22/BUILD.bazel
+++ b/psi/algorithm/rr22/BUILD.bazel
@@ -46,8 +46,8 @@ psi_cc_library(
deps = [
":davis_meyer_hash",
":rr22_utils",
- "//psi/rr22/okvs:aes_crhash",
- "//psi/rr22/okvs:baxos",
+ "//psi/algorithm/rr22/okvs:aes_crhash",
+ "//psi/algorithm/rr22/okvs:baxos",
"@yacl//yacl/base:buffer",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/crypto/tools:prg",
@@ -81,11 +81,11 @@ psi_cc_library(
hdrs = ["rr22_utils.h"],
deps = [
":sparsehash_config",
- "//psi/rr22/okvs:galois128",
- "//psi/rr22/okvs:simple_index",
+ "//psi/algorithm/rr22/okvs:galois128",
+ "//psi/algorithm/rr22/okvs:simple_index",
"//psi/utils:bucket",
- "@com_github_ridiculousfish_libdivide//:libdivide",
- "@com_github_sparsehash_sparsehash//:sparsehash",
+ "@libdivide",
+ "@sparsehash",
"@yacl//yacl/base:buffer",
"@yacl//yacl/base:int128",
"@yacl//yacl/link",
diff --git a/psi/rr22/common.cc b/psi/algorithm/rr22/common.cc
similarity index 96%
rename from psi/rr22/common.cc
rename to psi/algorithm/rr22/common.cc
index 80a4349..8f7fff6 100644
--- a/psi/rr22/common.cc
+++ b/psi/algorithm/rr22/common.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/common.h"
+#include "psi/algorithm/rr22/common.h"
#include "omp.h"
diff --git a/psi/rr22/common.h b/psi/algorithm/rr22/common.h
similarity index 91%
rename from psi/rr22/common.h
rename to psi/algorithm/rr22/common.h
index 2af8dd8..fdbad4c 100644
--- a/psi/rr22/common.h
+++ b/psi/algorithm/rr22/common.h
@@ -15,8 +15,8 @@
#include
-#include "psi/rr22/rr22_oprf.h"
-#include "psi/rr22/rr22_psi.h"
+#include "psi/algorithm/rr22/rr22_oprf.h"
+#include "psi/algorithm/rr22/rr22_psi.h"
#include "psi/utils/recovery.h"
#include "psi/proto/psi_v2.pb.h"
diff --git a/psi/rr22/davis_meyer_hash.cc b/psi/algorithm/rr22/davis_meyer_hash.cc
similarity index 98%
rename from psi/rr22/davis_meyer_hash.cc
rename to psi/algorithm/rr22/davis_meyer_hash.cc
index 919a894..47d78e1 100644
--- a/psi/rr22/davis_meyer_hash.cc
+++ b/psi/algorithm/rr22/davis_meyer_hash.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/davis_meyer_hash.h"
+#include "psi/algorithm/rr22/davis_meyer_hash.h"
#include "yacl/crypto/aes/aes_opt.h"
#include "yacl/crypto/block_cipher/symmetric_crypto.h"
diff --git a/psi/rr22/davis_meyer_hash.h b/psi/algorithm/rr22/davis_meyer_hash.h
similarity index 100%
rename from psi/rr22/davis_meyer_hash.h
rename to psi/algorithm/rr22/davis_meyer_hash.h
diff --git a/psi/rr22/davis_meyer_hash_test.cc b/psi/algorithm/rr22/davis_meyer_hash_test.cc
similarity index 96%
rename from psi/rr22/davis_meyer_hash_test.cc
rename to psi/algorithm/rr22/davis_meyer_hash_test.cc
index f601501..d34df6d 100644
--- a/psi/rr22/davis_meyer_hash_test.cc
+++ b/psi/algorithm/rr22/davis_meyer_hash_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/davis_meyer_hash.h"
+#include "psi/algorithm/rr22/davis_meyer_hash.h"
#include "gtest/gtest.h"
#include "spdlog/spdlog.h"
diff --git a/psi/rr22/okvs/BUILD.bazel b/psi/algorithm/rr22/okvs/BUILD.bazel
similarity index 88%
rename from psi/rr22/okvs/BUILD.bazel
rename to psi/algorithm/rr22/okvs/BUILD.bazel
index d8f7bbe..2598953 100644
--- a/psi/rr22/okvs/BUILD.bazel
+++ b/psi/algorithm/rr22/okvs/BUILD.bazel
@@ -35,7 +35,7 @@ psi_cc_test(
srcs = ["baxos_test.cc"],
deps = [
":baxos",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/crypto/tools:prg",
],
@@ -60,7 +60,7 @@ psi_cc_test(
srcs = ["paxos_test.cc"],
deps = [
":paxos",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/crypto/tools:prg",
],
@@ -77,8 +77,8 @@ psi_cc_library(
deps = [
":aes_crhash",
":galois128",
- "@com_github_ridiculousfish_libdivide//:libdivide",
- "@com_google_absl//absl/types:span",
+ "@abseil-cpp//absl/types:span",
+ "@libdivide",
"@yacl//yacl/math:gadget",
"@yacl//yacl/utils:platform_utils",
],
@@ -99,7 +99,7 @@ psi_cc_library(
hdrs = ["paxos_utils.h"],
deps = [
":galois128",
- "@com_google_absl//absl/types:span",
+ "@abseil-cpp//absl/types:span",
"@yacl//yacl/crypto/tools:prg",
],
)
@@ -120,7 +120,7 @@ psi_cc_test(
deps = [
":aes_crhash",
":galois128",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/crypto/tools:prg",
],
)
@@ -136,7 +136,7 @@ psi_cc_library(
"//conditions:default": [],
}),
deps = [
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/base:block",
"@yacl//yacl/base:int128",
"@yacl//yacl/link",
@@ -149,7 +149,7 @@ psi_cc_test(
srcs = ["galois128_test.cc"],
deps = [
":galois128",
- "@com_google_absl//absl/strings",
+ "@abseil-cpp//absl/strings",
"@yacl//yacl/crypto/rand",
"@yacl//yacl/crypto/tools:prg",
],
@@ -161,7 +161,7 @@ psi_cc_library(
hdrs = ["dense_mtx.h"],
deps = [
":galois128",
- "@com_google_absl//absl/types:span",
+ "@abseil-cpp//absl/types:span",
"@yacl//yacl/crypto/tools:prg",
],
)
@@ -171,8 +171,8 @@ psi_cc_library(
srcs = ["simple_index.cc"],
hdrs = ["simple_index.h"],
deps = [
- "@boost//:math",
- "@boost//:multiprecision",
+ "@boost.math//:boost.math",
+ "@boost.multiprecision//:boost.multiprecision",
"@yacl//yacl/base:exception",
],
)
diff --git a/psi/rr22/okvs/aes_crhash.cc b/psi/algorithm/rr22/okvs/aes_crhash.cc
similarity index 98%
rename from psi/rr22/okvs/aes_crhash.cc
rename to psi/algorithm/rr22/okvs/aes_crhash.cc
index f425046..68f99f5 100644
--- a/psi/rr22/okvs/aes_crhash.cc
+++ b/psi/algorithm/rr22/okvs/aes_crhash.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/aes_crhash.h"
+#include "psi/algorithm/rr22/okvs/aes_crhash.h"
#include
diff --git a/psi/rr22/okvs/aes_crhash.h b/psi/algorithm/rr22/okvs/aes_crhash.h
similarity index 100%
rename from psi/rr22/okvs/aes_crhash.h
rename to psi/algorithm/rr22/okvs/aes_crhash.h
diff --git a/psi/rr22/okvs/aes_crhash_test.cc b/psi/algorithm/rr22/okvs/aes_crhash_test.cc
similarity index 95%
rename from psi/rr22/okvs/aes_crhash_test.cc
rename to psi/algorithm/rr22/okvs/aes_crhash_test.cc
index f42963f..b13808a 100644
--- a/psi/rr22/okvs/aes_crhash_test.cc
+++ b/psi/algorithm/rr22/okvs/aes_crhash_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/aes_crhash.h"
+#include "psi/algorithm/rr22/okvs/aes_crhash.h"
#include
#include
@@ -22,7 +22,7 @@
#include "spdlog/spdlog.h"
#include "yacl/crypto/tools/prg.h"
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/baxos.cc b/psi/algorithm/rr22/okvs/baxos.cc
similarity index 99%
rename from psi/rr22/okvs/baxos.cc
rename to psi/algorithm/rr22/okvs/baxos.cc
index c2ff514..7c7bca0 100644
--- a/psi/rr22/okvs/baxos.cc
+++ b/psi/algorithm/rr22/okvs/baxos.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/baxos.h"
+#include "psi/algorithm/rr22/okvs/baxos.h"
#include
#include
@@ -22,7 +22,7 @@
#include "spdlog/spdlog.h"
-#include "psi/rr22/okvs/simple_index.h"
+#include "psi/algorithm/rr22/okvs/simple_index.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/baxos.h b/psi/algorithm/rr22/okvs/baxos.h
similarity index 96%
rename from psi/rr22/okvs/baxos.h
rename to psi/algorithm/rr22/okvs/baxos.h
index 02d8cdd..e89d27c 100644
--- a/psi/rr22/okvs/baxos.h
+++ b/psi/algorithm/rr22/okvs/baxos.h
@@ -20,10 +20,10 @@
#include "absl/types/span.h"
-#include "psi/rr22/okvs/dense_mtx.h"
-#include "psi/rr22/okvs/galois128.h"
-#include "psi/rr22/okvs/paxos.h"
-#include "psi/rr22/okvs/paxos_utils.h"
+#include "psi/algorithm/rr22/okvs/dense_mtx.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/paxos.h"
+#include "psi/algorithm/rr22/okvs/paxos_utils.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/baxos_test.cc b/psi/algorithm/rr22/okvs/baxos_test.cc
similarity index 98%
rename from psi/rr22/okvs/baxos_test.cc
rename to psi/algorithm/rr22/okvs/baxos_test.cc
index 573f8af..14b04b1 100644
--- a/psi/rr22/okvs/baxos_test.cc
+++ b/psi/algorithm/rr22/okvs/baxos_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/baxos.h"
+#include "psi/algorithm/rr22/okvs/baxos.h"
#include
#include
diff --git a/psi/rr22/okvs/dense_mtx.cc b/psi/algorithm/rr22/okvs/dense_mtx.cc
similarity index 98%
rename from psi/rr22/okvs/dense_mtx.cc
rename to psi/algorithm/rr22/okvs/dense_mtx.cc
index 6735c43..14bd158 100644
--- a/psi/rr22/okvs/dense_mtx.cc
+++ b/psi/algorithm/rr22/okvs/dense_mtx.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/dense_mtx.h"
+#include "psi/algorithm/rr22/okvs/dense_mtx.h"
#include
diff --git a/psi/rr22/okvs/dense_mtx.h b/psi/algorithm/rr22/okvs/dense_mtx.h
similarity index 99%
rename from psi/rr22/okvs/dense_mtx.h
rename to psi/algorithm/rr22/okvs/dense_mtx.h
index 4a2c21e..00eacb8 100644
--- a/psi/rr22/okvs/dense_mtx.h
+++ b/psi/algorithm/rr22/okvs/dense_mtx.h
@@ -19,7 +19,7 @@
#include "yacl/base/exception.h"
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/galois128.cc b/psi/algorithm/rr22/okvs/galois128.cc
similarity index 98%
rename from psi/rr22/okvs/galois128.cc
rename to psi/algorithm/rr22/okvs/galois128.cc
index 181f297..effbd60 100644
--- a/psi/rr22/okvs/galois128.cc
+++ b/psi/algorithm/rr22/okvs/galois128.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
#include
@@ -21,7 +21,7 @@
#include "yacl/utils/platform_utils.h"
#ifdef __x86_64__
-#include "cpu_features/cpuinfo_x86.h"
+#include "cpuinfo_x86.h"
#endif
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/galois128.h b/psi/algorithm/rr22/okvs/galois128.h
similarity index 100%
rename from psi/rr22/okvs/galois128.h
rename to psi/algorithm/rr22/okvs/galois128.h
diff --git a/psi/rr22/okvs/galois128_test.cc b/psi/algorithm/rr22/okvs/galois128_test.cc
similarity index 97%
rename from psi/rr22/okvs/galois128_test.cc
rename to psi/algorithm/rr22/okvs/galois128_test.cc
index 04e78a9..e710b49 100644
--- a/psi/rr22/okvs/galois128_test.cc
+++ b/psi/algorithm/rr22/okvs/galois128_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
#include
diff --git a/psi/rr22/okvs/paxos.cc b/psi/algorithm/rr22/okvs/paxos.cc
similarity index 99%
rename from psi/rr22/okvs/paxos.cc
rename to psi/algorithm/rr22/okvs/paxos.cc
index a01907d..60302d6 100644
--- a/psi/rr22/okvs/paxos.cc
+++ b/psi/algorithm/rr22/okvs/paxos.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/paxos.h"
+#include "psi/algorithm/rr22/okvs/paxos.h"
#include
#include
diff --git a/psi/rr22/okvs/paxos.h b/psi/algorithm/rr22/okvs/paxos.h
similarity index 98%
rename from psi/rr22/okvs/paxos.h
rename to psi/algorithm/rr22/okvs/paxos.h
index 2ec657f..43e5e28 100644
--- a/psi/rr22/okvs/paxos.h
+++ b/psi/algorithm/rr22/okvs/paxos.h
@@ -22,10 +22,10 @@
#include "libdivide.h"
#include "yacl/utils/platform_utils.h"
-#include "psi/rr22/okvs/dense_mtx.h"
-#include "psi/rr22/okvs/galois128.h"
-#include "psi/rr22/okvs/paxos_hash.h"
-#include "psi/rr22/okvs/paxos_utils.h"
+#include "psi/algorithm/rr22/okvs/dense_mtx.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/paxos_hash.h"
+#include "psi/algorithm/rr22/okvs/paxos_utils.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/paxos_hash.cc b/psi/algorithm/rr22/okvs/paxos_hash.cc
similarity index 99%
rename from psi/rr22/okvs/paxos_hash.cc
rename to psi/algorithm/rr22/okvs/paxos_hash.cc
index 975963f..887fff5 100644
--- a/psi/rr22/okvs/paxos_hash.cc
+++ b/psi/algorithm/rr22/okvs/paxos_hash.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/paxos_hash.h"
+#include "psi/algorithm/rr22/okvs/paxos_hash.h"
#include
diff --git a/psi/rr22/okvs/paxos_hash.h b/psi/algorithm/rr22/okvs/paxos_hash.h
similarity index 98%
rename from psi/rr22/okvs/paxos_hash.h
rename to psi/algorithm/rr22/okvs/paxos_hash.h
index 79d4b16..4d60bfd 100644
--- a/psi/rr22/okvs/paxos_hash.h
+++ b/psi/algorithm/rr22/okvs/paxos_hash.h
@@ -23,8 +23,8 @@
#include "yacl/math/gadget.h"
#include "yacl/utils/platform_utils.h"
-#include "psi/rr22/okvs/aes_crhash.h"
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/aes_crhash.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/paxos_hash_test.cc b/psi/algorithm/rr22/okvs/paxos_hash_test.cc
similarity index 97%
rename from psi/rr22/okvs/paxos_hash_test.cc
rename to psi/algorithm/rr22/okvs/paxos_hash_test.cc
index b109cb8..ea4bd74 100644
--- a/psi/rr22/okvs/paxos_hash_test.cc
+++ b/psi/algorithm/rr22/okvs/paxos_hash_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/paxos_hash.h"
+#include "psi/algorithm/rr22/okvs/paxos_hash.h"
#include "gtest/gtest.h"
#include "spdlog/spdlog.h"
diff --git a/psi/rr22/okvs/paxos_test.cc b/psi/algorithm/rr22/okvs/paxos_test.cc
similarity index 98%
rename from psi/rr22/okvs/paxos_test.cc
rename to psi/algorithm/rr22/okvs/paxos_test.cc
index 623c966..d892652 100644
--- a/psi/rr22/okvs/paxos_test.cc
+++ b/psi/algorithm/rr22/okvs/paxos_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/paxos.h"
+#include "psi/algorithm/rr22/okvs/paxos.h"
#include "absl/strings/escaping.h"
#include "gtest/gtest.h"
diff --git a/psi/rr22/okvs/paxos_utils.cc b/psi/algorithm/rr22/okvs/paxos_utils.cc
similarity index 92%
rename from psi/rr22/okvs/paxos_utils.cc
rename to psi/algorithm/rr22/okvs/paxos_utils.cc
index 4eede9e..2127b83 100644
--- a/psi/rr22/okvs/paxos_utils.cc
+++ b/psi/algorithm/rr22/okvs/paxos_utils.cc
@@ -12,6 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/paxos_utils.h"
+#include "psi/algorithm/rr22/okvs/paxos_utils.h"
namespace psi::rr22::okvs {}
diff --git a/psi/rr22/okvs/paxos_utils.h b/psi/algorithm/rr22/okvs/paxos_utils.h
similarity index 99%
rename from psi/rr22/okvs/paxos_utils.h
rename to psi/algorithm/rr22/okvs/paxos_utils.h
index 9bd7112..654a30a 100644
--- a/psi/rr22/okvs/paxos_utils.h
+++ b/psi/algorithm/rr22/okvs/paxos_utils.h
@@ -22,7 +22,7 @@
#include "absl/types/span.h"
#include "yacl/crypto/tools/prg.h"
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
namespace psi::rr22::okvs {
diff --git a/psi/rr22/okvs/simple_index.cc b/psi/algorithm/rr22/okvs/simple_index.cc
similarity index 99%
rename from psi/rr22/okvs/simple_index.cc
rename to psi/algorithm/rr22/okvs/simple_index.cc
index 15883d3..58c4a51 100644
--- a/psi/rr22/okvs/simple_index.cc
+++ b/psi/algorithm/rr22/okvs/simple_index.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/okvs/simple_index.h"
+#include "psi/algorithm/rr22/okvs/simple_index.h"
#include
#include
diff --git a/psi/rr22/okvs/simple_index.h b/psi/algorithm/rr22/okvs/simple_index.h
similarity index 100%
rename from psi/rr22/okvs/simple_index.h
rename to psi/algorithm/rr22/okvs/simple_index.h
diff --git a/psi/rr22/receiver.cc b/psi/algorithm/rr22/receiver.cc
similarity index 96%
rename from psi/rr22/receiver.cc
rename to psi/algorithm/rr22/receiver.cc
index b7e5285..4c235cc 100644
--- a/psi/rr22/receiver.cc
+++ b/psi/algorithm/rr22/receiver.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/receiver.h"
+#include "psi/algorithm/rr22/receiver.h"
#include
#include
@@ -24,11 +24,11 @@
#include "yacl/crypto/rand/rand.h"
#include "yacl/utils/parallel.h"
+#include "psi/algorithm/rr22/common.h"
+#include "psi/algorithm/rr22/rr22_psi.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
#include "psi/legacy/bucket_psi.h"
#include "psi/prelude.h"
-#include "psi/rr22/common.h"
-#include "psi/rr22/rr22_psi.h"
-#include "psi/rr22/rr22_utils.h"
#include "psi/trace_categories.h"
#include "psi/utils/bucket.h"
#include "psi/utils/serialize.h"
diff --git a/psi/rr22/receiver.h b/psi/algorithm/rr22/receiver.h
similarity index 96%
rename from psi/rr22/receiver.h
rename to psi/algorithm/rr22/receiver.h
index b9efa0f..ab06fbe 100644
--- a/psi/rr22/receiver.h
+++ b/psi/algorithm/rr22/receiver.h
@@ -13,8 +13,8 @@
// limitations under the License.
#pragma once
+#include "psi/algorithm/rr22/rr22_psi.h"
#include "psi/interface.h"
-#include "psi/rr22/rr22_psi.h"
#include "psi/utils/hash_bucket_cache.h"
#include "psi/proto/psi_v2.pb.h"
diff --git a/psi/rr22/rr22_oprf.cc b/psi/algorithm/rr22/rr22_oprf.cc
similarity index 99%
rename from psi/rr22/rr22_oprf.cc
rename to psi/algorithm/rr22/rr22_oprf.cc
index 49645db..9812812 100644
--- a/psi/rr22/rr22_oprf.cc
+++ b/psi/algorithm/rr22/rr22_oprf.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/rr22_oprf.h"
+#include "psi/algorithm/rr22/rr22_oprf.h"
#include
#include
@@ -29,9 +29,9 @@
#include "yacl/math/galois_field/gf_intrinsic.h"
#include "yacl/utils/parallel.h"
-#include "psi/rr22/davis_meyer_hash.h"
-#include "psi/rr22/okvs/galois128.h"
-#include "psi/rr22/rr22_utils.h"
+#include "psi/algorithm/rr22/davis_meyer_hash.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
namespace psi::rr22 {
diff --git a/psi/rr22/rr22_oprf.h b/psi/algorithm/rr22/rr22_oprf.h
similarity index 99%
rename from psi/rr22/rr22_oprf.h
rename to psi/algorithm/rr22/rr22_oprf.h
index db436dc..ecdc4f9 100644
--- a/psi/rr22/rr22_oprf.h
+++ b/psi/algorithm/rr22/rr22_oprf.h
@@ -22,7 +22,7 @@
#include "yacl/kernel/algorithms/silent_vole.h"
#include "yacl/link/context.h"
-#include "psi/rr22/okvs/baxos.h"
+#include "psi/algorithm/rr22/okvs/baxos.h"
// Reference:
// Blazing Fast PSI from Improved OKVS and Subfield VOLE
diff --git a/psi/rr22/rr22_oprf_test.cc b/psi/algorithm/rr22/rr22_oprf_test.cc
similarity index 97%
rename from psi/rr22/rr22_oprf_test.cc
rename to psi/algorithm/rr22/rr22_oprf_test.cc
index 0da6774..798b970 100644
--- a/psi/rr22/rr22_oprf_test.cc
+++ b/psi/algorithm/rr22/rr22_oprf_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/rr22_oprf.h"
+#include "psi/algorithm/rr22/rr22_oprf.h"
#include
#include
@@ -21,7 +21,7 @@
#include "yacl/crypto/tools/prg.h"
#include "yacl/link/test_util.h"
-#include "psi/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
namespace psi::rr22 {
diff --git a/psi/rr22/rr22_psi.cc b/psi/algorithm/rr22/rr22_psi.cc
similarity index 96%
rename from psi/rr22/rr22_psi.cc
rename to psi/algorithm/rr22/rr22_psi.cc
index d33f40e..9553435 100644
--- a/psi/rr22/rr22_psi.cc
+++ b/psi/algorithm/rr22/rr22_psi.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/rr22_psi.h"
+#include "psi/algorithm/rr22/rr22_psi.h"
#include
#include
@@ -28,9 +28,9 @@
#include "yacl/base/byte_container_view.h"
#include "yacl/utils/parallel.h"
-#include "psi/rr22/okvs/galois128.h"
-#include "psi/rr22/rr22_oprf.h"
-#include "psi/rr22/rr22_utils.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/rr22_oprf.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
#include "psi/utils/bucket.h"
#include "psi/utils/sync.h"
diff --git a/psi/rr22/rr22_psi.h b/psi/algorithm/rr22/rr22_psi.h
similarity index 99%
rename from psi/rr22/rr22_psi.h
rename to psi/algorithm/rr22/rr22_psi.h
index 6375bac..d940ad7 100644
--- a/psi/rr22/rr22_psi.h
+++ b/psi/algorithm/rr22/rr22_psi.h
@@ -31,7 +31,7 @@
#include "yacl/base/int128.h"
#include "yacl/link/context.h"
-#include "psi/rr22/rr22_oprf.h"
+#include "psi/algorithm/rr22/rr22_oprf.h"
#include "psi/utils/bucket.h"
#include "psi/utils/hash_bucket_cache.h"
diff --git a/psi/rr22/rr22_psi_benchmark.cc b/psi/algorithm/rr22/rr22_psi_benchmark.cc
similarity index 98%
rename from psi/rr22/rr22_psi_benchmark.cc
rename to psi/algorithm/rr22/rr22_psi_benchmark.cc
index 4e633de..5b5c493 100644
--- a/psi/rr22/rr22_psi_benchmark.cc
+++ b/psi/algorithm/rr22/rr22_psi_benchmark.cc
@@ -27,9 +27,9 @@
#include "yacl/link/context.h"
#include "yacl/link/test_util.h"
-#include "psi/rr22/rr22_oprf.h"
-#include "psi/rr22/rr22_psi.h"
-#include "psi/rr22/rr22_utils.h"
+#include "psi/algorithm/rr22/rr22_oprf.h"
+#include "psi/algorithm/rr22/rr22_psi.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
namespace {
diff --git a/psi/rr22/rr22_psi_test.cc b/psi/algorithm/rr22/rr22_psi_test.cc
similarity index 98%
rename from psi/rr22/rr22_psi_test.cc
rename to psi/algorithm/rr22/rr22_psi_test.cc
index b2e12a6..c28fb3a 100644
--- a/psi/rr22/rr22_psi_test.cc
+++ b/psi/algorithm/rr22/rr22_psi_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/rr22_psi.h"
+#include "psi/algorithm/rr22/rr22_psi.h"
#include
#include
@@ -28,7 +28,7 @@
#include "yacl/crypto/tools/prg.h"
#include "yacl/link/test_util.h"
-#include "psi/rr22/rr22_utils.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
#include "psi/utils/hash_bucket_cache.h"
namespace psi::rr22 {
diff --git a/psi/rr22/rr22_utils.cc b/psi/algorithm/rr22/rr22_utils.cc
similarity index 98%
rename from psi/rr22/rr22_utils.cc
rename to psi/algorithm/rr22/rr22_utils.cc
index 6528106..839ac2b 100644
--- a/psi/rr22/rr22_utils.cc
+++ b/psi/algorithm/rr22/rr22_utils.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/rr22_utils.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
#include
#include
@@ -29,8 +29,8 @@
#include "sparsehash/dense_hash_map"
#include "yacl/utils/parallel.h"
-#include "psi/rr22/okvs/galois128.h"
-#include "psi/rr22/okvs/simple_index.h"
+#include "psi/algorithm/rr22/okvs/galois128.h"
+#include "psi/algorithm/rr22/okvs/simple_index.h"
#include "psi/utils/serialize.h"
namespace psi::rr22 {
diff --git a/psi/rr22/rr22_utils.h b/psi/algorithm/rr22/rr22_utils.h
similarity index 100%
rename from psi/rr22/rr22_utils.h
rename to psi/algorithm/rr22/rr22_utils.h
diff --git a/psi/rr22/sender.cc b/psi/algorithm/rr22/sender.cc
similarity index 96%
rename from psi/rr22/sender.cc
rename to psi/algorithm/rr22/sender.cc
index c40147d..b69f6ce 100644
--- a/psi/rr22/sender.cc
+++ b/psi/algorithm/rr22/sender.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/rr22/sender.h"
+#include "psi/algorithm/rr22/sender.h"
#include
#include
@@ -20,10 +20,10 @@
#include "yacl/crypto/hash/hash_utils.h"
#include "yacl/utils/parallel.h"
+#include "psi/algorithm/rr22/common.h"
+#include "psi/algorithm/rr22/rr22_psi.h"
+#include "psi/algorithm/rr22/rr22_utils.h"
#include "psi/legacy/bucket_psi.h"
-#include "psi/rr22/common.h"
-#include "psi/rr22/rr22_psi.h"
-#include "psi/rr22/rr22_utils.h"
#include "psi/trace_categories.h"
#include "psi/utils/bucket.h"
#include "psi/utils/sync.h"
diff --git a/psi/rr22/sender.h b/psi/algorithm/rr22/sender.h
similarity index 96%
rename from psi/rr22/sender.h
rename to psi/algorithm/rr22/sender.h
index 6fbb28d..1b3f484 100644
--- a/psi/rr22/sender.h
+++ b/psi/algorithm/rr22/sender.h
@@ -13,8 +13,8 @@
// limitations under the License.
#pragma once
+#include "psi/algorithm/rr22/rr22_psi.h"
#include "psi/interface.h"
-#include "psi/rr22/rr22_psi.h"
#include "psi/utils/hash_bucket_cache.h"
#include "psi/proto/psi_v2.pb.h"
diff --git a/psi/rr22/sparseconfig.h b/psi/algorithm/rr22/sparseconfig.h
similarity index 100%
rename from psi/rr22/sparseconfig.h
rename to psi/algorithm/rr22/sparseconfig.h
diff --git a/psi/sealpir/BUILD.bazel b/psi/algorithm/sealpir/BUILD.bazel
similarity index 85%
rename from psi/sealpir/BUILD.bazel
rename to psi/algorithm/sealpir/BUILD.bazel
index a48675e..b195699 100644
--- a/psi/sealpir/BUILD.bazel
+++ b/psi/algorithm/sealpir/BUILD.bazel
@@ -25,7 +25,7 @@ psi_cc_library(
"-lm",
],
deps = [
- "@com_github_microsoft_seal//:seal",
+ "@seal",
"@yacl//yacl/base:exception",
],
)
@@ -40,10 +40,10 @@ psi_cc_library(
],
deps = [
":seal_pir_utils",
- "//psi/kwpir:kw_pir",
- "//psi/sealpir:serializable_cc_proto",
- "@com_github_microsoft_seal//:seal",
- "@com_github_openssl_openssl//:openssl",
+ ":serializable_cc_proto",
+ "//psi/algorithm/kwpir:kw_pir",
+ "@openssl",
+ "@seal",
"@yacl//yacl/base:byte_container_view",
"@yacl//yacl/base:exception",
"@yacl//yacl/link",
@@ -66,6 +66,6 @@ psi_cc_test(
srcs = ["seal_pir_test.cc"],
deps = [
":seal_pir",
- "@com_github_microsoft_seal//:seal",
+ "@seal",
],
)
diff --git a/psi/sealpir/README.md b/psi/algorithm/sealpir/README.md
similarity index 100%
rename from psi/sealpir/README.md
rename to psi/algorithm/sealpir/README.md
diff --git a/psi/sealpir/seal_pir.cc b/psi/algorithm/sealpir/seal_pir.cc
similarity index 99%
rename from psi/sealpir/seal_pir.cc
rename to psi/algorithm/sealpir/seal_pir.cc
index 3c6a062..3269e76 100644
--- a/psi/sealpir/seal_pir.cc
+++ b/psi/algorithm/sealpir/seal_pir.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/sealpir/seal_pir.h"
+#include "psi/algorithm/sealpir/seal_pir.h"
#include
@@ -1024,4 +1024,4 @@ Ciphertext SealPirClient::GetOne() {
return ct;
}
-} // namespace psi::sealpir
\ No newline at end of file
+} // namespace psi::sealpir
diff --git a/psi/sealpir/seal_pir.h b/psi/algorithm/sealpir/seal_pir.h
similarity index 97%
rename from psi/sealpir/seal_pir.h
rename to psi/algorithm/sealpir/seal_pir.h
index 7ff2a96..eb18f29 100644
--- a/psi/sealpir/seal_pir.h
+++ b/psi/algorithm/sealpir/seal_pir.h
@@ -23,10 +23,10 @@
#include "seal/util/polyarithsmallmod.h"
#include "yacl/base/byte_container_view.h"
-#include "psi/kwpir/index_pir.h"
-#include "psi/sealpir/seal_pir_utils.h"
+#include "psi/algorithm/kwpir/index_pir.h"
+#include "psi/algorithm/sealpir/seal_pir_utils.h"
-#include "psi/sealpir/serializable.pb.h"
+#include "psi/algorithm/sealpir/serializable.pb.h"
namespace psi::sealpir {
@@ -214,4 +214,4 @@ class SealPirClient : public SealPir, public psi::kwpir::IndexPirClient {
friend class SealPirServer;
};
-} // namespace psi::sealpir
\ No newline at end of file
+} // namespace psi::sealpir
diff --git a/psi/sealpir/seal_pir_test.cc b/psi/algorithm/sealpir/seal_pir_test.cc
similarity index 98%
rename from psi/sealpir/seal_pir_test.cc
rename to psi/algorithm/sealpir/seal_pir_test.cc
index c7cf612..a94750f 100644
--- a/psi/sealpir/seal_pir_test.cc
+++ b/psi/algorithm/sealpir/seal_pir_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/sealpir/seal_pir.h"
+#include "psi/algorithm/sealpir/seal_pir.h"
#include
@@ -163,4 +163,4 @@ INSTANTIATE_TEST_SUITE_P(
TestParams{4096, 1 << 18, 10, 0, 2, 20, true},
TestParams{4096, 1000, 288, 100, 2, 20, true},
TestParams{8192, 1000, 288, 0, 2, 20, true}));
-} // namespace psi::sealpir
\ No newline at end of file
+} // namespace psi::sealpir
diff --git a/psi/sealpir/seal_pir_utils.cc b/psi/algorithm/sealpir/seal_pir_utils.cc
similarity index 96%
rename from psi/sealpir/seal_pir_utils.cc
rename to psi/algorithm/sealpir/seal_pir_utils.cc
index b871d0c..0d8ba27 100644
--- a/psi/sealpir/seal_pir_utils.cc
+++ b/psi/algorithm/sealpir/seal_pir_utils.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "psi/sealpir/seal_pir_utils.h"
+#include "psi/algorithm/sealpir/seal_pir_utils.h"
#include "yacl/base/exception.h"
@@ -59,4 +59,4 @@ std::vector MemoryDbPlaintextStore::ReadPlaintexts(
return db_vec_[sub_db_index];
}
-} // namespace psi::sealpir
\ No newline at end of file
+} // namespace psi::sealpir
diff --git a/psi/sealpir/seal_pir_utils.h b/psi/algorithm/sealpir/seal_pir_utils.h
similarity index 100%
rename from psi/sealpir/seal_pir_utils.h
rename to psi/algorithm/sealpir/seal_pir_utils.h
diff --git a/psi/sealpir/serializable.proto b/psi/algorithm/sealpir/serializable.proto
similarity index 100%
rename from psi/sealpir/serializable.proto
rename to psi/algorithm/sealpir/serializable.proto
diff --git a/psi/algorithm/spiral/BUILD.bazel b/psi/algorithm/spiral/BUILD.bazel
new file mode 100644
index 0000000..1056634
--- /dev/null
+++ b/psi/algorithm/spiral/BUILD.bazel
@@ -0,0 +1,172 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load("//bazel:psi.bzl", "psi_cc_library", "psi_cc_test")
+
+package(default_visibility = ["//visibility:public"])
+
+psi_cc_library(
+ name = "common",
+ hdrs = ["common.h"],
+ deps = [
+ "@abseil-cpp//absl/types:span",
+ ],
+)
+
+psi_cc_library(
+ name = "util",
+ srcs = ["util.cc"],
+ hdrs = ["util.h"],
+ deps = [
+ ":params",
+ ],
+)
+
+psi_cc_library(
+ name = "params",
+ srcs = ["params.cc"],
+ hdrs = ["params.h"],
+ deps = [
+ ":common",
+ "//psi/algorithm/spiral/arith",
+ "//psi/algorithm/spiral/arith:ntt_table",
+ "//psi/algorithm/spiral/arith:number_theory",
+ "@yacl//yacl/base:exception",
+ "@yacl//yacl/base:int128",
+ "@yacl//yacl/crypto/hash:blake3",
+ "@yacl//yacl/utils:elapsed_timer",
+ ],
+)
+
+psi_cc_test(
+ name = "params_test",
+ srcs = ["params_test.cc"],
+ deps = [
+ ":common",
+ ":params",
+ ":util",
+ "//psi/algorithm/spiral/arith:ntt_table",
+ ],
+)
+
+psi_cc_library(
+ name = "poly_matrix",
+ srcs = ["poly_matrix.cc"],
+ hdrs = ["poly_matrix.h"],
+ copts = ["-mavx2"],
+ deps = [
+ ":params",
+ ":util",
+ "//psi/algorithm/spiral/arith:arith_params",
+ "//psi/algorithm/spiral/arith:ntt",
+ "@abseil-cpp//absl/strings",
+ "@abseil-cpp//absl/types:span",
+ "@yacl//yacl/base:aligned_vector",
+ "@yacl//yacl/base:exception",
+ "@yacl//yacl/base:int128",
+ "@yacl//yacl/crypto/rand",
+ "@yacl//yacl/crypto/tools:prg",
+ "@yacl//yacl/utils:parallel",
+ ] + select({
+ "@platforms//cpu:aarch64": [
+ "@sse2neon",
+ ],
+ "//conditions:default": [],
+ }),
+)
+
+psi_cc_library(
+ name = "poly_matrix_utils",
+ srcs = ["poly_matrix_utils.cc"],
+ hdrs = ["poly_matrix_utils.h"],
+ copts = ["-mavx2"],
+ deps = [
+ "poly_matrix",
+ ":params",
+ ":util",
+ "//psi/algorithm/spiral/arith:arith_params",
+ "//psi/algorithm/spiral/arith:ntt",
+ "@abseil-cpp//absl/strings",
+ "@abseil-cpp//absl/types:span",
+ "@yacl//yacl/base:aligned_vector",
+ "@yacl//yacl/base:exception",
+ "@yacl//yacl/base:int128",
+ "@yacl//yacl/crypto/rand",
+ "@yacl//yacl/crypto/tools:prg",
+ "@yacl//yacl/utils:parallel",
+ ],
+)
+
+psi_cc_test(
+ name = "poly_matrix_test",
+ srcs = ["poly_matrix_test.cc"],
+ copts = ["-mavx2"],
+ deps = [
+ "poly_matrix",
+ "poly_matrix_utils",
+ ":common",
+ ":params",
+ ":util",
+ "//psi/algorithm/spiral/arith:ntt_table",
+ "@abseil-cpp//absl/types:span",
+ "@seal",
+ "@yacl//yacl/base:buffer",
+ "@yacl//yacl/utils:elapsed_timer",
+ "@yacl//yacl/utils:parallel",
+ ],
+)
+
+psi_cc_library(
+ name = "discrete_gaussian",
+ srcs = ["discrete_gaussian.cc"],
+ hdrs = ["discrete_gaussian.h"],
+ deps = [
+ ":poly_matrix",
+ "@yacl//yacl/crypto/rand",
+ "@yacl//yacl/crypto/tools:prg",
+ ],
+)
+
+psi_cc_test(
+ name = "discrete_gaussian_test",
+ srcs = ["discrete_gaussian_test.cc"],
+ deps = [
+ ":discrete_gaussian",
+ ":params",
+ ":poly_matrix",
+ ":util",
+ "@yacl//yacl/crypto/rand",
+ "@yacl//yacl/crypto/tools:prg",
+ ],
+)
+
+psi_cc_library(
+ name = "gadget",
+ srcs = ["gadget.cc"],
+ hdrs = ["gadget.h"],
+ deps = [
+ ":params",
+ ":poly_matrix",
+ ],
+)
+
+psi_cc_test(
+ name = "gadget_test",
+ srcs = ["gadget_test.cc"],
+ deps = [
+ ":gadget",
+ ":params",
+ ":util",
+ ],
+)
diff --git a/psi/algorithm/spiral/README.md b/psi/algorithm/spiral/README.md
new file mode 100644
index 0000000..7ed8299
--- /dev/null
+++ b/psi/algorithm/spiral/README.md
@@ -0,0 +1,4 @@
+
+This is a C++ implementation of [Spiral Fast, High Rate Single Server PIR via FHE Composition](https://eprint.iacr.org/2022/368).
+
+We referred to the [Rust implementation](https://github.com/blyssprivacy/sdk/tree/main/lib/spiral-rs) corresponding to this paper.
\ No newline at end of file
diff --git a/psi/algorithm/spiral/arith/BUILD.bazel b/psi/algorithm/spiral/arith/BUILD.bazel
new file mode 100644
index 0000000..c86f328
--- /dev/null
+++ b/psi/algorithm/spiral/arith/BUILD.bazel
@@ -0,0 +1,112 @@
+load("//bazel:psi.bzl", "psi_cc_library", "psi_cc_test")
+
+package(default_visibility = ["//visibility:public"])
+
+psi_cc_library(
+ name = "arith",
+ hdrs = ["arith.h"],
+ deps = [
+ "//psi/algorithm/spiral:common",
+ "@abseil-cpp//absl/strings",
+ "@seal",
+ "@yacl//yacl/base:exception",
+ "@yacl//yacl/base:int128",
+ "@yacl//yacl/math:gadget",
+ ],
+)
+
+psi_cc_library(
+ name = "arith_params",
+ hdrs = ["arith_params.h"],
+ deps = [
+ ":arith",
+ "//psi/algorithm/spiral:common",
+ "//psi/algorithm/spiral:params",
+ "@abseil-cpp//absl/strings",
+ "@seal",
+ "@yacl//yacl/base:exception",
+ "@yacl//yacl/base:int128",
+ ],
+)
+
+psi_cc_library(
+ name = "number_theory",
+ hdrs = ["number_theory.h"],
+ deps = [
+ ":arith",
+ "//psi/algorithm/spiral:common",
+ "@abseil-cpp//absl/strings",
+ "@seal",
+ "@yacl//yacl/base:exception",
+ ],
+)
+
+psi_cc_library(
+ name = "ntt_table",
+ srcs = ["ntt_table.cc"],
+ hdrs = ["ntt_table.h"],
+ deps = [
+ ":arith",
+ ":number_theory",
+ "//psi/algorithm/spiral:common",
+ "@seal",
+ "@yacl//yacl/base:exception",
+ ],
+)
+
+psi_cc_library(
+ name = "ntt",
+ srcs = ["ntt.cc"],
+ hdrs = ["ntt.h"],
+ copts = ["-mavx2"],
+ deps = [
+ ":arith",
+ ":ntt_table",
+ ":number_theory",
+ "//psi/algorithm/spiral:params",
+ "@abseil-cpp//absl/types:span",
+ "@seal",
+ "@yacl//yacl/base:aligned_vector",
+ "@yacl//yacl/base:exception",
+ "@yacl//yacl/utils:parallel",
+ ] + select({
+ "@platforms//cpu:aarch64": [
+ "@sse2neon",
+ ],
+ "//conditions:default": [],
+ }),
+)
+
+psi_cc_test(
+ name = "arith_test",
+ srcs = ["arith_test.cc"],
+ deps = [
+ ":arith",
+ ":arith_params",
+ "//psi/algorithm/spiral:util",
+ "@abseil-cpp//absl/strings",
+ "@seal",
+ ],
+)
+
+psi_cc_test(
+ name = "number_theory_test",
+ srcs = ["number_theory_test.cc"],
+ deps = [
+ ":number_theory",
+ ],
+)
+
+psi_cc_test(
+ name = "ntt_table_test",
+ srcs = ["ntt_table_test.cc"],
+ copts = ["-mavx2"],
+ deps = [
+ ":ntt",
+ ":ntt_table",
+ "//psi/algorithm/spiral:params",
+ "//psi/algorithm/spiral:util",
+ "@abseil-cpp//absl/types:span",
+ "@yacl//yacl/base:aligned_vector",
+ ],
+)
diff --git a/psi/algorithm/spiral/arith/arith.h b/psi/algorithm/spiral/arith/arith.h
new file mode 100644
index 0000000..90bdbc7
--- /dev/null
+++ b/psi/algorithm/spiral/arith/arith.h
@@ -0,0 +1,235 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include
+#include
+#include
+
+#include "absl/strings/numbers.h"
+#include "seal/seal.h"
+#include "seal/util/common.h"
+#include "seal/util/uintarith.h"
+#include "seal/util/uintarithsmallmod.h"
+#include "yacl/base/exception.h"
+#include "yacl/base/int128.h"
+#include "yacl/math/gadget.h"
+
+#include "psi/algorithm/spiral/common.h"
+
+namespace psi::spiral::arith {
+
+inline std::uint64_t Log2(std::uint64_t a) { return yacl::math::Log2Floor(a); }
+
+inline std::uint64_t Log2Ceil(std::uint64_t a) {
+ return yacl::math::Log2Ceil(a);
+}
+
+inline std::pair GetBarrettCrs(
+ std::uint64_t modulus) {
+ // represent 2^{128}
+ std::array numerator{0, 0, 1};
+ std::array quotient{0, 0, 0};
+ seal::util::divide_uint192_inplace(numerator.data(), modulus,
+ quotient.data());
+ // barrett redeuce precomputation
+ return std::make_pair(quotient[0], quotient[1]);
+}
+
+inline std::pair,
+ std::array>
+GetBarrett(const std::vector& moduli) {
+ std::array cr0{0, 0, 0, 0};
+ std::array cr1{0, 0, 0, 0};
+
+ for (std::size_t i = 0; i < moduli.size(); ++i) {
+ std::pair crs = GetBarrettCrs(moduli[i]);
+ cr0[i] = crs.first;
+ cr1[i] = crs.second;
+ }
+ return std::make_pair(cr0, cr1);
+}
+
+inline std::uint64_t ExponentitateUintMod(std::uint64_t operand,
+ std::uint64_t exponent,
+ std::uint64_t modulus) {
+ seal::Modulus mod(modulus);
+ return seal::util::exponentiate_uint_mod(operand, exponent, mod);
+}
+
+inline std::uint64_t ExponentitateUintMod(std::uint64_t operand,
+ std::uint64_t exponent,
+ const seal::Modulus& mod) {
+ return seal::util::exponentiate_uint_mod(operand, exponent, mod);
+}
+
+inline std::uint64_t ReverseBits(std::uint64_t x, std::uint64_t bit_count) {
+ if (bit_count == 0) {
+ return 0;
+ }
+ return seal::util::reverse_bits(x, bit_count);
+}
+
+inline std::uint64_t Div2UintMod(std::uint64_t operand, std::uint64_t modulus) {
+ seal::Modulus mod(modulus);
+ return seal::util::div2_uint_mod(operand, mod);
+}
+
+inline std::uint64_t Div2UintMod(std::uint64_t operand,
+ const seal::Modulus& mod) {
+ return seal::util::div2_uint_mod(operand, mod);
+}
+
+inline std::uint64_t Recenter(std::uint64_t val, std::uint64_t from_modulus,
+ std::uint64_t to_modulus) {
+ YACL_ENFORCE(from_modulus >= to_modulus);
+
+ auto from_modulus_i64 = static_cast(from_modulus);
+ auto to_modulus_i64 = static_cast(to_modulus);
+ auto a_val = static_cast(val);
+
+ if (val >= from_modulus / 2) {
+ a_val -= from_modulus_i64;
+ }
+
+ a_val = a_val + (from_modulus_i64 / to_modulus_i64) * to_modulus_i64 +
+ 2 * to_modulus_i64;
+ a_val %= to_modulus_i64;
+
+ return static_cast(a_val);
+}
+
+inline std::uint64_t BarrettRawU64(std::uint64_t input,
+ std::uint64_t const_ratio_1,
+ std::uint64_t modulus) {
+ std::uint64_t tmp = 0ULL;
+ seal::util::multiply_uint64_hw64(input, const_ratio_1,
+ reinterpret_cast(&tmp));
+
+ std::uint64_t res = input - (tmp * modulus);
+
+ return res >= modulus ? res - modulus : res;
+}
+
+inline std::uint64_t BarrettRawU128(uint128_t val, std::uint64_t cr0,
+ std::uint64_t cr1, std::uint64_t modulus) {
+ auto [h64, l64] = yacl::DecomposeUInt128(val);
+
+ std::uint64_t tmp1 = 0ULL;
+ std::uint64_t tmp3 = 0ULL;
+ // seal api need unsigned long long type
+ unsigned long long carry = 0ULL;
+ // std::array tmp2 = {0ULL, 0ULL};
+ unsigned long long tmp2[2]{0ULL, 0ULL};
+ // Round 1
+ // (x0 * m0)_1 , 即 x0 * m0 的高 64 bits
+ seal::util::multiply_uint64_hw64(l64, cr0, &carry);
+ // tmp2 = [(x0 * m1)_0, (x0 * m1)_1]
+ seal::util::multiply_uint64(l64, cr1, tmp2);
+
+ tmp3 = tmp2[1] + seal::util::add_uint64(tmp2[0], carry, &tmp1);
+
+ // Round2
+ seal::util::multiply_uint64(h64, cr0, tmp2);
+ carry = tmp2[1] + seal::util::add_uint64(tmp1, tmp2[0], &tmp1);
+ // This is all we care about
+ tmp1 = h64 * cr1 + tmp3 + carry;
+
+ // reduction
+ tmp3 = l64 - tmp1 * modulus;
+ // this is a lazy result \in [0, 2*modulus)
+ return tmp3;
+}
+
+inline std::uint64_t BarrettReductionU128Raw(uint128_t val, std::uint64_t cr0,
+ std::uint64_t cr1,
+ std::uint64_t modulus) {
+ std::uint64_t reduced_val = BarrettRawU128(val, cr0, cr1, modulus);
+ reduced_val -= (modulus) * static_cast(reduced_val >= modulus);
+ return reduced_val;
+}
+
+inline std::uint64_t RecenertMod(std::uint64_t val, std::uint64_t small_modulus,
+ std::uint64_t large_modulus) {
+ YACL_ENFORCE_LT(val, small_modulus);
+
+ auto val_i64 = static_cast(val);
+ auto small_modulus_i64 = static_cast(small_modulus);
+ auto large_modulus_i64 = static_cast(large_modulus);
+
+ if (val_i64 > (small_modulus_i64 / 2)) {
+ val_i64 -= small_modulus_i64;
+ }
+ if (val_i64 < 0) {
+ val_i64 += large_modulus_i64;
+ }
+ return static_cast(val_i64);
+}
+
+inline std::uint64_t Rescale(std::uint64_t a, std::uint64_t in_mod,
+ std::uint64_t out_mod) {
+ auto in_mod_i64 = static_cast(in_mod);
+ int128_t in_mod_i128 = yacl::MakeInt128(0, in_mod);
+ int128_t out_mod_i128 = yacl::MakeInt128(0, out_mod);
+
+ auto in_val = static_cast(a % in_mod);
+ if (in_val >= (in_mod_i64 / 2)) {
+ in_val -= in_mod_i64;
+ }
+ std::int64_t sign = (in_val >= 0) ? 1 : -1;
+ // int64_t can directly mul int128_t
+ // do need to firstly convert to
+ int128_t val = in_val * out_mod_i128;
+
+ // val + int64_t = int128_t + int64_t, this is ok
+ int128_t result = (val + sign * (in_mod_i64 / 2)) / in_mod_i128;
+
+ // if the low-64 bit's type is int64_t, you must be carefully use MakeInt128
+ int128_t tmp = yacl::MakeInt128(0, (in_mod / out_mod) * out_mod);
+ result = (result + tmp + (2 * out_mod_i128)) % out_mod_i128;
+
+ YACL_ENFORCE(result >= 0);
+
+ result = (result + out_mod_i128) % out_mod_i128;
+ auto last_result = yacl::DecomposeInt128(result).second;
+
+ return last_result;
+}
+
+inline std::uint64_t MultiplyUintMod(std::uint64_t a, std::uint64_t b,
+ std::uint64_t modulus) {
+ seal::Modulus mod(modulus);
+ return seal::util::multiply_uint_mod(a, b, mod);
+}
+
+inline std::uint64_t MultiplyUintMod(std::uint64_t a, std::uint64_t b,
+ const seal::Modulus& mod) {
+ return seal::util::multiply_uint_mod(a, b, mod);
+}
+
+inline std::uint64_t MultiplyUintMod(std::uint64_t a, std::uint64_t b,
+ std::uint64_t modulus,
+ uint64_t barrett_cr0,
+ uint64_t barrett_cr1) {
+ unsigned long long z[2] = {0ULL, 0ULL};
+ seal::util::multiply_uint64(a, b, z);
+ uint128_t z128 = yacl::MakeUint128(z[1], z[0]);
+ return BarrettReductionU128Raw(z128, barrett_cr0, barrett_cr1, modulus);
+}
+
+inline size_t UintNum(size_t len, size_t uint_len) {
+ return (len + uint_len - 1) / uint_len;
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/arith_params.h b/psi/algorithm/spiral/arith/arith_params.h
new file mode 100644
index 0000000..cbbf6c7
--- /dev/null
+++ b/psi/algorithm/spiral/arith/arith_params.h
@@ -0,0 +1,66 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include
+
+#include "yacl/base/int128.h"
+
+#include "psi/algorithm/spiral/arith/arith.h"
+#include "psi/algorithm/spiral/params.h"
+
+namespace psi::spiral::arith {
+
+inline std::uint64_t BarrettU64(const Params& params, std::uint64_t val) {
+ return BarrettRawU64(val, params.BarrettCr1Modulus(), params.Modulus());
+}
+
+inline std::uint64_t BarrettCoeffU64(const Params& params, std::uint64_t val,
+ std::size_t moduli_idx) {
+ return BarrettRawU64(val, params.BarrettCr1(moduli_idx),
+ params.Moduli(moduli_idx));
+}
+
+inline std::uint64_t BarrettReductionU128(const Params& params, uint128_t val) {
+ return BarrettReductionU128Raw(val, params.BarrettCr0Modulus(),
+ params.BarrettCr1Modulus(), params.Modulus());
+}
+
+inline std::uint64_t MultiplyModular(const Params& params, std::uint64_t a,
+ std::uint64_t b, std::size_t moduli_idx) {
+ return BarrettCoeffU64(params, a * b, moduli_idx);
+}
+
+inline std::uint64_t MultiplyAddModular(const Params& params, std::uint64_t a,
+ std::uint64_t b, std::uint64_t x,
+ std::size_t moduli_idx) {
+ return BarrettCoeffU64(params, a * b + x, moduli_idx);
+}
+
+inline std::uint64_t AddModular(const Params& params, std::uint64_t a,
+ std::uint64_t b, std::size_t moduli_idx) {
+ return BarrettCoeffU64(params, a + b, moduli_idx);
+}
+
+inline std::uint64_t InvertModular(const Params& params, std::uint64_t a,
+ std::size_t moduli_idx) {
+ return params.Moduli(moduli_idx) - a;
+}
+
+inline std::uint64_t ModularReduce(const Params& params, std::uint64_t a,
+ std::size_t moduli_idx) {
+ return BarrettCoeffU64(params, a, moduli_idx);
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/arith_test.cc b/psi/algorithm/spiral/arith/arith_test.cc
new file mode 100644
index 0000000..af7cbe2
--- /dev/null
+++ b/psi/algorithm/spiral/arith/arith_test.cc
@@ -0,0 +1,227 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "psi/algorithm/spiral/arith/arith.h"
+
+#include
+#include
+#include
+
+#include "gtest/gtest.h"
+
+#include "psi/algorithm/spiral/arith/arith_params.h"
+#include "psi/algorithm/spiral/util.h"
+
+namespace psi::spiral::arith {
+
+namespace {
+constexpr std::size_t kMaxLoop = 1000;
+}
+
+TEST(ArithTest, MultiplyUintMod) {
+ std::uint64_t mod{2};
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 0, mod));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 1, mod));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(1, 0, mod));
+ ASSERT_EQ(1ULL, arith::MultiplyUintMod(1, 1, mod));
+
+ auto [cr0, cr1] = arith::GetBarrettCrs(mod);
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 0, mod, cr0, cr1));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 1, mod, cr0, cr1));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(1, 0, mod, cr0, cr1));
+ ASSERT_EQ(1ULL, arith::MultiplyUintMod(1, 1, mod, cr0, cr1));
+
+ mod = 10;
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 0, mod));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 1, mod));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(1, 0, mod));
+ ASSERT_EQ(1ULL, arith::MultiplyUintMod(1, 1, mod));
+ ASSERT_EQ(9ULL, arith::MultiplyUintMod(7, 7, mod));
+ ASSERT_EQ(2ULL, arith::MultiplyUintMod(6, 7, mod));
+ ASSERT_EQ(2ULL, arith::MultiplyUintMod(7, 6, mod));
+
+ mod = 2305843009211596801ULL;
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 0, mod));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(0, 1, mod));
+ ASSERT_EQ(0ULL, arith::MultiplyUintMod(1, 0, mod));
+ ASSERT_EQ(1ULL, arith::MultiplyUintMod(1, 1, mod));
+ ASSERT_EQ(576460752302899200ULL,
+ arith::MultiplyUintMod(1152921504605798400ULL,
+ 1152921504605798401ULL, mod));
+ ASSERT_EQ(576460752302899200ULL,
+ arith::MultiplyUintMod(1152921504605798401ULL,
+ 1152921504605798400ULL, mod));
+ ASSERT_EQ(1729382256908697601ULL,
+ arith::MultiplyUintMod(1152921504605798401ULL,
+ 1152921504605798401ULL, mod));
+ ASSERT_EQ(1ULL, arith::MultiplyUintMod(2305843009211596800ULL,
+ 2305843009211596800ULL, mod));
+
+ auto [cr00, cr11] = arith::GetBarrettCrs(mod);
+ ASSERT_EQ(576460752302899200ULL,
+ arith::MultiplyUintMod(1152921504605798400ULL,
+ 1152921504605798401ULL, mod, cr00, cr11));
+ ASSERT_EQ(576460752302899200ULL,
+ arith::MultiplyUintMod(1152921504605798401ULL,
+ 1152921504605798400ULL, mod, cr00, cr11));
+ ASSERT_EQ(1729382256908697601ULL,
+ arith::MultiplyUintMod(1152921504605798401ULL,
+ 1152921504605798401ULL, mod, cr00, cr11));
+ ASSERT_EQ(1ULL,
+ arith::MultiplyUintMod(2305843009211596800ULL,
+ 2305843009211596800ULL, mod, cr00, cr11));
+}
+
+TEST(ArithTest, ReverseBits) {
+ ASSERT_EQ(0ULL, arith::ReverseBits(0ULL, 0));
+ ASSERT_EQ(0ULL, arith::ReverseBits(0ULL, 1));
+ ASSERT_EQ(0ULL, arith::ReverseBits(0ULL, 32));
+ ASSERT_EQ(0ULL, arith::ReverseBits(0ULL, 64));
+
+ ASSERT_EQ(0ULL, arith::ReverseBits(1ULL, 0));
+ ASSERT_EQ(1ULL, arith::ReverseBits(1ULL, 1));
+ ASSERT_EQ(1ULL << 31, arith::ReverseBits(1ULL, 32));
+ ASSERT_EQ(1ULL << 63, arith::ReverseBits(1ULL, 64));
+
+ ASSERT_EQ(0ULL, arith::ReverseBits(1ULL << 31, 0));
+ ASSERT_EQ(0ULL, arith::ReverseBits(1ULL << 31, 1));
+ ASSERT_EQ(1ULL, arith::ReverseBits(1ULL << 31, 32));
+ ASSERT_EQ(1ULL << 32, arith::ReverseBits(1ULL << 31, 64));
+
+ ASSERT_EQ(0ULL, arith::ReverseBits(0xFFFFULL << 16, 0));
+ ASSERT_EQ(0ULL, arith::ReverseBits(0xFFFFULL << 16, 1));
+ ASSERT_EQ(0xFFFFULL, arith::ReverseBits(0xFFFFULL << 16, 32));
+ ASSERT_EQ(0xFFFFULL << 32, arith::ReverseBits(0xFFFFULL << 16, 64));
+
+ ASSERT_EQ(0ULL, arith::ReverseBits(0x0000FFFFFFFF0000ULL, 0));
+ ASSERT_EQ(0ULL, arith::ReverseBits(0x0000FFFFFFFF0000ULL, 1));
+ ASSERT_EQ(0xFFFFULL, arith::ReverseBits(0x0000FFFFFFFF0000ULL, 32));
+ ASSERT_EQ(0x0000FFFFFFFF0000ULL,
+ arith::ReverseBits(0x0000FFFFFFFF0000ULL, 64));
+
+ ASSERT_EQ(0ULL, arith::ReverseBits(0xFFFF0000FFFF0000ULL, 0));
+ ASSERT_EQ(0ULL, arith::ReverseBits(0xFFFF0000FFFF0000ULL, 1));
+ ASSERT_EQ(0xFFFFULL, arith::ReverseBits(0xFFFF0000FFFF0000ULL, 32));
+ ASSERT_EQ(0x0000FFFF0000FFFFULL,
+ arith::ReverseBits(0xFFFF0000FFFF0000ULL, 64));
+}
+
+TEST(ArithTest, BarrettRawU64) {
+ std::uint64_t mod{10};
+ auto const_ratio = arith::GetBarrettCrs(mod);
+
+ ASSERT_EQ(0, arith::BarrettRawU64(0, const_ratio.second, mod));
+ ASSERT_EQ(1, arith::BarrettRawU64(1, const_ratio.second, mod));
+ ASSERT_EQ(8, arith::BarrettRawU64(8, const_ratio.second, mod));
+ ASSERT_EQ(7, arith::BarrettRawU64(1234567, const_ratio.second, mod));
+ ASSERT_EQ(0, arith::BarrettRawU64(12345670, const_ratio.second, mod));
+
+ mod = 66974689739603969ULL;
+ std::uint64_t cr1 = 275ULL;
+
+ std::random_device rd;
+ std::mt19937 rng(rd());
+
+ for (std::size_t i = 0; i < kMaxLoop; ++i) {
+ std::uint64_t val = rng();
+ ASSERT_EQ(val % mod, arith::BarrettRawU64(val, cr1, mod));
+ }
+}
+
+TEST(ArithTest, Div2UintMod) { ASSERT_EQ(5, arith::Div2UintMod(3, 7)); }
+
+TEST(ArithTest, GetBarrettCrs) {
+ std::pair expected =
+ std::make_pair(16144578669088582089ULL, 68736257792ULL);
+ ASSERT_EQ(expected, arith::GetBarrettCrs(268369921ULL));
+
+ expected = std::make_pair(10966983149909726427ULL, 73916747789ULL);
+ ASSERT_EQ(expected, arith::GetBarrettCrs(249561089ULL));
+
+ expected = std::make_pair(7906011006380390721ULL, 275ULL);
+ ASSERT_EQ(expected, arith::GetBarrettCrs(66974689739603969ULL));
+}
+TEST(ArithTest, BarrettReductionU128Raw) {
+ std::uint64_t modulus = 66974689739603969ULL;
+ uint128_t modulus_u128 = yacl::MakeUint128(0ULL, modulus);
+
+ std::function exec = [](uint128_t val) {
+ return BarrettReductionU128Raw(val, 7906011006380390721ULL, 275ULL,
+ 66974689739603969ULL);
+ };
+
+ ASSERT_EQ(0, exec(modulus_u128));
+ ASSERT_EQ(1, exec(modulus_u128 + 1));
+ ASSERT_EQ(5, exec((modulus_u128 * 7) + 5));
+
+ std::random_device rd;
+ std::mt19937 rng(rd());
+ for (std::size_t i = 0; i < kMaxLoop; ++i) {
+ std::uint64_t val = rng();
+ uint128_t val_u128 = yacl::MakeUint128(0ULL, val);
+ ASSERT_EQ(val % modulus, exec(val_u128));
+ }
+ // compare with seal::util::barrett_reduce_128
+ modulus = 13131313131313ULL;
+ auto const_ratio = GetBarrettCrs(modulus);
+ seal::Modulus mod(modulus);
+
+ ASSERT_EQ(const_ratio.first, mod.const_ratio()[0]);
+ ASSERT_EQ(const_ratio.second, mod.const_ratio()[1]);
+
+ uint128_t val = yacl::MakeUint128(0, 0);
+ ASSERT_EQ(0, BarrettReductionU128Raw(val, const_ratio.first,
+ const_ratio.second, modulus));
+
+ val = yacl::MakeUint128(0, 1);
+ ASSERT_EQ(1, BarrettReductionU128Raw(val, const_ratio.first,
+ const_ratio.second, modulus));
+
+ val = yacl::MakeUint128(456, 123);
+ ASSERT_EQ(8722750765283ULL,
+ BarrettReductionU128Raw(val, const_ratio.first, const_ratio.second,
+ modulus));
+
+ val = yacl::MakeUint128(79797979797979, 24242424242424);
+ ASSERT_EQ(1010101010101ULL,
+ BarrettReductionU128Raw(val, const_ratio.first, const_ratio.second,
+ modulus));
+}
+
+TEST(ArithTest, Rescale) {
+ ASSERT_EQ(4, Rescale(3, 17, 21));
+ ASSERT_EQ(2, Rescale(3, 21, 17));
+ ASSERT_EQ(1, Rescale(1, 17, 21));
+
+ std::uint64_t in_mod = 0x7fffffd8001ULL;
+ std::uint64_t out_mod = 0x7fffffc8001ULL;
+
+ EXPECT_EQ(Rescale(2721421219ULL, in_mod, out_mod), 2721421199ULL);
+ EXPECT_EQ(Rescale(2093223862ULL, in_mod, out_mod), 2093223846ULL);
+ EXPECT_EQ(Rescale(3304378079ULL, in_mod, out_mod), 3304378054ULL);
+ EXPECT_EQ(Rescale(3286543357ULL, in_mod, out_mod), 3286543333ULL);
+ EXPECT_EQ(Rescale(1506336168ULL, in_mod, out_mod), 1506336157ULL);
+ EXPECT_EQ(Rescale(3294507908ULL, in_mod, out_mod), 3294507883ULL);
+ EXPECT_EQ(Rescale(3602954393ULL, in_mod, out_mod), 3602954366ULL);
+ EXPECT_EQ(Rescale(3268316190ULL, in_mod, out_mod), 3268316166ULL);
+ EXPECT_EQ(Rescale(3730398221ULL, in_mod, out_mod), 3730398193ULL);
+ EXPECT_EQ(Rescale(3537330165ULL, in_mod, out_mod), 3537330139ULL);
+
+ std::uint64_t modulus = 66974689739603969ULL;
+ std::uint64_t pt_modulus = 256;
+ std::uint64_t in = 34795444278750647ULL;
+ EXPECT_EQ(133, Rescale(in, modulus, pt_modulus));
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/ntt.cc b/psi/algorithm/spiral/arith/ntt.cc
new file mode 100644
index 0000000..9acb795
--- /dev/null
+++ b/psi/algorithm/spiral/arith/ntt.cc
@@ -0,0 +1,373 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "psi/algorithm/spiral/arith/ntt.h"
+
+#ifdef __x86_64__
+#include <immintrin.h>
+#elif defined(__aarch64__)
+#include "sse2neon.h"
+#endif
+
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+
+#include "absl/types/span.h"
+#include "spdlog/spdlog.h"
+#include "yacl/base/aligned_vector.h"
+#include "yacl/base/exception.h"
+#include "yacl/utils/parallel.h"
+
+#include "psi/algorithm/spiral/arith/number_theory.h"
+
+namespace psi::spiral::arith {
+
+#ifndef __AVX2__
+
+void NttForward(const Params& params, absl::Span<uint64_t> operand_overall) {
+  std::size_t log_n = params.PolyLenLog2();
+  std::size_t n = static_cast<std::size_t>(1) << log_n;
+
+ for (std::size_t coeff_mod = 0; coeff_mod < params.CrtCount(); ++coeff_mod) {
+ auto operand = operand_overall.subspan(coeff_mod * n, n);
+
+ const auto& forward_table = params.GetNttForwardTable(coeff_mod);
+ const auto& forward_table_prime = params.GetNttForwardPrimeTable(coeff_mod);
+ // why need to convert uint32
+    auto modulus_small = static_cast<std::uint32_t>(params.Moduli(coeff_mod));
+ std::uint32_t two_times_modulus_small = 2 * modulus_small;
+
+ for (std::size_t mm = 0; mm < log_n; ++mm) {
+ std::size_t m = 1 << mm;
+ std::size_t t = n >> (mm + 1);
+
+ for (std::size_t i = 0; i < m; ++i) {
+ uint64_t w = forward_table[m + i];
+ uint64_t w_prime = forward_table_prime[m + i];
+
+ auto op = operand.subspan(i * (2 * t), 2 * t);
+
+ for (std::size_t j = 0; j < t; ++j) {
+          std::uint32_t x = static_cast<std::uint32_t>(op[j]);
+          std::uint32_t y = static_cast<std::uint32_t>(op[t + j]);
+
+          std::uint32_t curr_x =
+              x - (two_times_modulus_small *
+                   static_cast<std::uint32_t>(x >= two_times_modulus_small));
+          std::uint64_t q_tmp = (static_cast<std::uint64_t>(y) *
+                                 static_cast<std::uint64_t>(w_prime)) >>
+                                32;
+          std::uint64_t q_new =
+              w * static_cast<std::uint64_t>(y) -
+              q_tmp * static_cast<std::uint64_t>(modulus_small);
+
+          op[j] = curr_x + q_new;
+          op[t + j] =
+              curr_x + (static_cast<std::uint64_t>(two_times_modulus_small) - q_new);
+ }
+ }
+
+ // Update the operand with modulus constraints
+ for (std::size_t i = 0; i < n; ++i) {
+      operand[i] -=
+          static_cast<std::uint64_t>(operand[i] >= two_times_modulus_small) *
+          two_times_modulus_small;
+      operand[i] -= static_cast<std::uint64_t>(operand[i] >= modulus_small) *
+                    modulus_small;
+ }
+ }
+ }
+}
+
+// AVX2 version of ntt_forward
+#else
+
+void NttForward(const Params& params, absl::Span<uint64_t> operand_overall) {
+  SPDLOG_DEBUG("using AVX2 NttForward");
+
+  std::size_t log_n = params.PolyLenLog2();
+  std::size_t n = static_cast<std::size_t>(1) << log_n;
+
+ YACL_ENFORCE(operand_overall.size() >= params.CrtCount() * n);
+
+ for (std::size_t coeff_mod = 0; coeff_mod < params.CrtCount(); ++coeff_mod) {
+ auto operand = operand_overall.subspan(coeff_mod * n, n);
+
+ const auto& forward_table = params.GetNttForwardTable(coeff_mod);
+ const auto& forward_table_prime = params.GetNttForwardPrimeTable(coeff_mod);
+    auto modulus_small = static_cast<std::uint32_t>(params.Moduli(coeff_mod));
+ std::uint32_t two_times_modulus_small = 2 * modulus_small;
+
+ for (std::size_t mm = 0; mm < log_n; ++mm) {
+ std::size_t m = 1 << mm;
+ std::size_t t = n >> (mm + 1);
+
+ for (std::size_t i = 0; i < m; ++i) {
+ uint64_t w = forward_table[m + i];
+ uint64_t w_prime = forward_table_prime[m + i];
+
+ auto op = operand.subspan(i * (2 * t), 2 * t);
+
+ SPDLOG_DEBUG("Processing coeff_mod: {}, m: {}, i: {}", coeff_mod, m, i);
+
+ if (t < 4) {
+ for (std::size_t j = 0; j < t; ++j) {
+            uint32_t x = static_cast<uint32_t>(op[j]);
+            uint32_t y = static_cast<uint32_t>(op[t + j]);
+
+            std::uint32_t curr_x =
+                x - (two_times_modulus_small *
+                     static_cast<std::uint32_t>(x >= two_times_modulus_small));
+            std::uint64_t q_tmp = (static_cast<std::uint64_t>(y) *
+                                   static_cast<std::uint64_t>(w_prime)) >>
+                                  32;
+            std::uint64_t q_new =
+                w * static_cast<std::uint64_t>(y) -
+                q_tmp * static_cast<std::uint64_t>(modulus_small);
+
+            op[j] = curr_x + q_new;
+            op[t + j] =
+                curr_x +
+                (static_cast<std::uint64_t>(two_times_modulus_small) - q_new);
+ }
+ } else {
+ for (std::size_t j = 0; j < t; j += 4) {
+ if (j + 4 > t) break; // Ensure we do not exceed bounds
+
+ __m256i* p_x = reinterpret_cast<__m256i*>(&op[j]);
+ __m256i* p_y = reinterpret_cast<__m256i*>(&op[j + t]);
+
+ __m256i x = _mm256_loadu_si256(p_x);
+ __m256i y = _mm256_loadu_si256(p_y);
+
+            __m256i cmp_val = _mm256_set1_epi64x(
+                static_cast<std::int64_t>(two_times_modulus_small));
+ // reuse this variable to reduce variable num
+ // gt_mask
+ __m256i tmp1 = _mm256_cmpgt_epi64(x, cmp_val);
+
+ // __m256i to_subtract = _mm256_and_si256(gt_mask_reused, cmp_val);
+ tmp1 = _mm256_and_si256(tmp1, cmp_val);
+ __m256i curr_x = _mm256_sub_epi64(x, tmp1);
+
+            // __m256i w_prime_vec =
+            //     _mm256_set1_epi64x(static_cast<std::int64_t>(w_prime));
+            tmp1 = _mm256_set1_epi64x(static_cast<std::int64_t>(w_prime));
+ // __m256i product = _mm256_mul_epu32(y, tmp1);
+ tmp1 = _mm256_mul_epu32(y, tmp1);
+ // __m256i q_val = _mm256_srli_epi64(tmp1, 32);
+ tmp1 = _mm256_srli_epi64(tmp1, 32);
+
+            // __m256i w_vec = _mm256_set1_epi64x(static_cast<std::int64_t>(w));
+            __m256i tmp2 = _mm256_set1_epi64x(static_cast<std::int64_t>(w));
+ // __m256i w_times_y = _mm256_mul_epu32(y, w_vec);
+ // __m256i w_times_y = _mm256_mul_epu32(y, tmp2);
+ tmp2 = _mm256_mul_epu32(y, tmp2);
+
+            __m256i modulus_small_vec =
+                _mm256_set1_epi64x(static_cast<std::int64_t>(modulus_small));
+ // __m256i q_scaled = _mm256_mul_epu32(q_val, modulus_small_vec);
+ __m256i q_scaled = _mm256_mul_epu32(tmp1, modulus_small_vec);
+ __m256i q_final = _mm256_sub_epi64(tmp2, q_scaled);
+
+ __m256i new_x = _mm256_add_epi64(curr_x, q_final);
+ __m256i q_final_inverted = _mm256_sub_epi64(cmp_val, q_final);
+ __m256i new_y = _mm256_add_epi64(curr_x, q_final_inverted);
+
+ _mm256_storeu_si256(p_x, new_x);
+ _mm256_storeu_si256(p_y, new_y);
+ }
+ }
+ }
+ }
+
+ for (std::size_t i = 0; i < n; i += 4) {
+ if (i + 4 > n) break; // Ensure we do not exceed bounds
+ __m256i* p_x = reinterpret_cast<__m256i*>(&operand[i]);
+
+      __m256i cmp_val1 =
+          _mm256_set1_epi64x(static_cast<std::int64_t>(two_times_modulus_small));
+ __m256i x = _mm256_loadu_si256(p_x);
+ __m256i gt_mask = _mm256_cmpgt_epi64(x, cmp_val1);
+ __m256i to_subtract = _mm256_and_si256(gt_mask, cmp_val1);
+ x = _mm256_sub_epi64(x, to_subtract);
+
+      __m256i cmp_val2 =
+          _mm256_set1_epi64x(static_cast<std::int64_t>(modulus_small));
+ gt_mask = _mm256_cmpgt_epi64(x, cmp_val2);
+ to_subtract = _mm256_and_si256(gt_mask, cmp_val2);
+ x = _mm256_sub_epi64(x, to_subtract);
+ _mm256_storeu_si256(p_x, x);
+ }
+ }
+}
+#endif
+
+#ifndef __AVX2__
+void NttInverse(const Params& params, absl::Span<uint64_t> operand_overall) {
+ for (std::size_t coeff_mod = 0; coeff_mod < params.CrtCount(); ++coeff_mod) {
+ std::size_t n = params.PolyLen();
+ auto operand = operand_overall.subspan(coeff_mod * n, n);
+
+ const auto& inverse_table = params.GetNttInverseTable(coeff_mod);
+ const auto& inverse_table_prime = params.GetNttInversePrimeTable(coeff_mod);
+ std::uint64_t modulus = params.Moduli(coeff_mod);
+ std::uint64_t two_times_modulus = 2 * modulus;
+
+ for (std::size_t mm = params.PolyLenLog2(); mm-- > 0;) {
+ std::size_t h = 1 << mm;
+ std::size_t t = n >> (mm + 1);
+
+ for (std::size_t i = 0; i < h; ++i) {
+ uint64_t w = inverse_table[h + i];
+ uint64_t w_prime = inverse_table_prime[h + i];
+
+ auto op = operand.subspan(i * 2 * t, 2 * t);
+
+ for (size_t j = 0; j < t; ++j) {
+ uint64_t x = op[j];
+ uint64_t y = op[t + j];
+
+ uint64_t t_tmp = two_times_modulus - y + x;
+          uint64_t curr_x =
+              x + y -
+              (two_times_modulus * static_cast<uint64_t>((x << 1) >= t_tmp));
+ uint64_t h_tmp = (t_tmp * w_prime) >> 32;
+
+ uint64_t res_x = (curr_x + (modulus * (t_tmp & 1))) >> 1;
+ uint64_t res_y = w * t_tmp - h_tmp * modulus;
+
+ op[j] = res_x;
+ op[t + j] = res_y;
+ }
+ }
+ }
+
+ for (size_t i = 0; i < n; ++i) {
+      operand[i] -= static_cast<std::uint64_t>(operand[i] >= two_times_modulus) *
+                    two_times_modulus;
+      operand[i] -= static_cast<std::uint64_t>(operand[i] >= modulus) * modulus;
+ }
+ }
+}
+
+#else
+
+void NttInverse(const Params& params, absl::Span<uint64_t> operand_overall) {
+ SPDLOG_DEBUG("use AVX2 NttInverse");
+
+ for (size_t coeff_mod = 0; coeff_mod < params.CrtCount(); ++coeff_mod) {
+ size_t n = params.PolyLen();
+ auto operand = operand_overall.subspan(coeff_mod * n, n);
+
+ const auto& inverse_table = params.GetNttInverseTable(coeff_mod);
+ const auto& inverse_table_prime = params.GetNttInversePrimeTable(coeff_mod);
+ uint64_t modulus = params.Moduli(coeff_mod);
+ uint64_t two_times_modulus = 2 * modulus;
+
+ for (size_t mm = params.PolyLenLog2(); mm-- > 0;) {
+ size_t h = 1 << mm;
+ size_t t = n >> (mm + 1);
+
+ for (size_t i = 0; i < h; ++i) {
+ uint64_t w = inverse_table[h + i];
+ uint64_t w_prime = inverse_table_prime[h + i];
+
+ auto op = operand.subspan(i * 2 * t, 2 * t); // 获取当前的操作段
+ if (op.size() < 2 * t) {
+ throw std::runtime_error("Operation span is too small.");
+ }
+
+ if (t < 4) {
+ for (size_t j = 0; j < t; ++j) {
+ uint64_t x = op[j];
+ uint64_t y = op[t + j];
+
+ uint64_t t_tmp = two_times_modulus - y + x;
+ uint64_t curr_x = x + y - (two_times_modulus * ((x << 1) >= t_tmp));
+ uint64_t h_tmp = (t_tmp * w_prime) >> 32;
+
+ uint64_t res_x = (curr_x + (modulus * (t_tmp & 1))) >> 1;
+ uint64_t res_y = w * t_tmp - h_tmp * modulus;
+
+ op[j] = res_x;
+ op[t + j] = res_y;
+ }
+ } else {
+ for (size_t j = 0; j < t; j += 4) {
+            __m256i x =
+                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&op[j]));
+            __m256i y = _mm256_loadu_si256(
+                reinterpret_cast<const __m256i*>(&op[j + t]));
+
+            __m256i modulus_vec =
+                _mm256_set1_epi64x(static_cast<std::int64_t>(modulus));
+            __m256i two_times_modulus_vec =
+                _mm256_set1_epi64x(static_cast<std::int64_t>(two_times_modulus));
+ __m256i t_tmp = _mm256_sub_epi64(two_times_modulus_vec, y);
+ t_tmp = _mm256_add_epi64(t_tmp, x);
+
+ // __m256i gt_mask =
+ // _mm256_cmpgt_epi64(_mm256_slli_epi64(x, 1), t_tmp);
+ __m256i tmp1 = _mm256_cmpgt_epi64(_mm256_slli_epi64(x, 1), t_tmp);
+ // __m256i to_subtract =
+ // _mm256_and_si256(tmp1, two_times_modulus_vec);
+
+ tmp1 = _mm256_and_si256(tmp1, two_times_modulus_vec);
+
+ __m256i curr_x = _mm256_add_epi64(x, y);
+ curr_x = _mm256_sub_epi64(curr_x, tmp1);
+
+            // __m256i w_prime_vec =
+            //     _mm256_set1_epi64x(static_cast<std::int64_t>(w_prime));
+            tmp1 = _mm256_set1_epi64x(static_cast<std::int64_t>(w_prime));
+ __m256i h_tmp = _mm256_mul_epu32(t_tmp, tmp1);
+ h_tmp = _mm256_srli_epi64(h_tmp, 32);
+
+ // __m256i and_mask = _mm256_set1_epi64x(1);
+ tmp1 = _mm256_set1_epi64x(1);
+ __m256i eq_mask =
+ _mm256_cmpeq_epi64(_mm256_and_si256(t_tmp, tmp1), tmp1);
+ // __m256i to_add = _mm256_and_si256(eq_mask, modulus_vec);
+ tmp1 = _mm256_and_si256(eq_mask, modulus_vec);
+
+ // __m256i new_x =
+ // _mm256_srli_epi64(_mm256_add_epi64(curr_x, tmp1), 1);
+ tmp1 = _mm256_srli_epi64(_mm256_add_epi64(curr_x, tmp1), 1);
+
+            __m256i w_vec = _mm256_set1_epi64x(static_cast<std::int64_t>(w));
+ __m256i w_times_t_tmp = _mm256_mul_epu32(t_tmp, w_vec);
+ __m256i h_tmp_times_modulus = _mm256_mul_epu32(h_tmp, modulus_vec);
+ __m256i new_y =
+ _mm256_sub_epi64(w_times_t_tmp, h_tmp_times_modulus);
+
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(&op[j]), tmp1);
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(&op[j + t]), new_y);
+ }
+ }
+ }
+ }
+
+ for (size_t i = 0; i < n; ++i) {
+      operand[i] -= static_cast<std::uint64_t>(operand[i] >= two_times_modulus) *
+                    two_times_modulus;
+      operand[i] -= static_cast<std::uint64_t>(operand[i] >= modulus) * modulus;
+ }
+ }
+}
+
+#endif
+
+} // namespace psi::spiral::arith
diff --git a/psi/legacy/kmprt17_mp_psi.cc b/psi/algorithm/spiral/arith/ntt.h
similarity index 52%
rename from psi/legacy/kmprt17_mp_psi.cc
rename to psi/algorithm/spiral/arith/ntt.h
index f69f656..65109b6 100644
--- a/psi/legacy/kmprt17_mp_psi.cc
+++ b/psi/algorithm/spiral/arith/ntt.h
@@ -1,4 +1,4 @@
-// Copyright 2024 zhangwfjh
+// Copyright 2024 Ant Group Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -11,26 +11,18 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+#pragma once
-#include "psi/legacy/kmprt17_mp_psi.h"
+#include <cstddef>
+#include <cstdint>
-#include
+#include "absl/types/span.h"
-#include "psi/legacy/factory.h"
+#include "psi/algorithm/spiral/params.h"
-namespace psi::psi {
+namespace psi::spiral::arith {
-namespace {
+void NttForward(const Params& params, absl::Span<uint64_t> operand_overall);
+void NttInverse(const Params& params, absl::Span<uint64_t> operand_overall);
-std::unique_ptr CreateOperator(
- const MemoryPsiConfig& config,
- const std::shared_ptr& lctx) {
- return std::make_unique(
- KmprtMpPsiOperator::Options({lctx, config.receiver_rank()}));
-}
-
-REGISTER_OPERATOR(KMPRT_PSI_NPC, CreateOperator);
-
-} // namespace
-
-} // namespace psi::psi
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/ntt_table.cc b/psi/algorithm/spiral/arith/ntt_table.cc
new file mode 100644
index 0000000..eed9346
--- /dev/null
+++ b/psi/algorithm/spiral/arith/ntt_table.cc
@@ -0,0 +1,161 @@
+
+
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "psi/algorithm/spiral/arith/ntt_table.h"
+
+#include
+
+#include "seal/modulus.h"
+#include "yacl/base/exception.h"
+
+#include "psi/algorithm/spiral/arith/number_theory.h"
+
+namespace psi::spiral::arith {
+
+std::vector<std::uint64_t> ScalePowersU32(
+    std::uint32_t modulus, std::size_t poly_len,
+    const std::vector<std::uint64_t>& in) {
+  std::vector<std::uint64_t> scaled_powers(poly_len, 0ULL);
+
+ for (std::size_t i = 0; i < poly_len; ++i) {
+    std::uint64_t wide_val = in[i] << 32;
+    std::uint64_t quotient = wide_val / static_cast<std::uint64_t>(modulus);
+    scaled_powers[i] =
+        static_cast<std::uint64_t>((static_cast<std::uint32_t>(quotient)));
+ }
+
+ return scaled_powers;
+}
+
+std::vector<std::uint64_t> PowersOfPrimitiveRoot(std::uint64_t root,
+                                                 std::uint64_t modulus,
+                                                 std::size_t poly_len_log2) {
+  std::size_t poly_len = 1 << poly_len_log2;
+  std::vector<std::uint64_t> root_powers(poly_len, 0ULL);
+ std::uint64_t power = root;
+
+ root_powers[0] = 1;
+ seal::Modulus mod(modulus);
+ for (std::size_t i = 1; i < poly_len; ++i) {
+ auto idx = arith::ReverseBits(i, poly_len_log2);
+ root_powers[idx] = power;
+ power = arith::MultiplyUintMod(power, root, mod);
+ }
+ return root_powers;
+}
+
+std::vector<std::uint64_t> PowersOfPrimitiveRoot(std::uint64_t root,
+                                                 const seal::Modulus& mod,
+                                                 std::size_t poly_len_log2) {
+  std::size_t poly_len = 1 << poly_len_log2;
+  std::vector<std::uint64_t> root_powers(poly_len, 0ULL);
+ std::uint64_t power = root;
+
+ root_powers[0] = 1;
+ for (std::size_t i = 1; i < poly_len; ++i) {
+ auto idx = arith::ReverseBits(i, poly_len_log2);
+ root_powers[idx] = power;
+ power = arith::MultiplyUintMod(power, root, mod);
+ }
+ return root_powers;
+}
+
+NttTables BuildNttTables(std::size_t poly_len,
+                         const std::vector<std::uint64_t>& moduli) {
+ YACL_ENFORCE(poly_len > 0);
+ YACL_ENFORCE(moduli.size() > 0);
+
+ std::size_t poly_len_log2 = arith::Log2(poly_len);
+
+ NttTables tables;
+
+ for (std::size_t i = 0; i < moduli.size(); ++i) {
+ std::uint64_t modulus = moduli[i];
+ seal::Modulus mod(modulus);
+ // todo: why need convert? maybe reduce error?
+    auto modulus_u32 = static_cast<std::uint32_t>(modulus);
+
+ std::uint64_t root = arith::GetMinimalPrimitiveRoot(2 * poly_len, mod);
+ std::uint64_t inv_root = arith::InvertUintMod(root, mod);
+
+ auto root_powers = PowersOfPrimitiveRoot(root, mod, poly_len_log2);
+
+ auto scaled_root_powers =
+ ScalePowersU32(modulus_u32, poly_len, root_powers);
+
+ auto inv_root_power = PowersOfPrimitiveRoot(inv_root, mod, poly_len_log2);
+
+ for (std::size_t j = 0; j < poly_len; ++j) {
+ inv_root_power[j] = arith::Div2UintMod(inv_root_power[j], mod);
+ }
+
+ auto scaled_inv_root_powers =
+ ScalePowersU32(modulus_u32, poly_len, inv_root_power);
+
+    std::vector<std::vector<std::uint64_t>> temp{
+ std::move(root_powers), std::move(scaled_root_powers),
+ std::move(inv_root_power), std::move(scaled_inv_root_powers)};
+
+ tables.emplace_back(std::move(temp));
+ }
+
+ return tables;
+}
+
+NttTables BuildNttTables(std::size_t poly_len,
+                         const std::vector<seal::Modulus>& moduli) {
+ YACL_ENFORCE(poly_len > 0);
+ YACL_ENFORCE(moduli.size() > 0);
+
+ std::size_t poly_len_log2 = arith::Log2(poly_len);
+
+ NttTables tables;
+
+ for (std::size_t i = 0; i < moduli.size(); ++i) {
+ std::uint64_t modulus = moduli[i].value();
+ // todo: why need convert? maybe reduce error?
+    auto modulus_u32 = static_cast<std::uint32_t>(modulus);
+
+ std::uint64_t root =
+ arith::GetMinimalPrimitiveRoot(2 * poly_len, moduli[i]);
+ std::uint64_t inv_root = arith::InvertUintMod(root, moduli[i]);
+
+ auto root_powers = PowersOfPrimitiveRoot(root, moduli[i], poly_len_log2);
+
+ auto scaled_root_powers =
+ ScalePowersU32(modulus_u32, poly_len, root_powers);
+
+ auto inv_root_power =
+ PowersOfPrimitiveRoot(inv_root, moduli[i], poly_len_log2);
+
+ for (std::size_t j = 0; j < poly_len; ++j) {
+ inv_root_power[j] = arith::Div2UintMod(inv_root_power[j], moduli[i]);
+ }
+
+ auto scaled_inv_root_powers =
+ ScalePowersU32(modulus_u32, poly_len, inv_root_power);
+
+    std::vector<std::vector<std::uint64_t>> temp{
+ std::move(root_powers), std::move(scaled_root_powers),
+ std::move(inv_root_power), std::move(scaled_inv_root_powers)};
+
+ tables.emplace_back(std::move(temp));
+ }
+
+ return tables;
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/ntt_table.h b/psi/algorithm/spiral/arith/ntt_table.h
new file mode 100644
index 0000000..8804e09
--- /dev/null
+++ b/psi/algorithm/spiral/arith/ntt_table.h
@@ -0,0 +1,40 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "seal/modulus.h"
+
+namespace psi::spiral::arith {
+
+using NttTables = std::vector<std::vector<std::vector<std::uint64_t>>>;
+
+NttTables BuildNttTables(std::size_t poly_len,
+                         const std::vector<std::uint64_t>& moduli);
+
+NttTables BuildNttTables(std::size_t poly_len,
+                         const std::vector<seal::Modulus>& moduli);
+
+std::vector<std::uint64_t> ScalePowersU32(std::uint32_t modulus,
+                                          std::size_t poly_len,
+                                          const std::vector<std::uint64_t>& in);
+
+std::vector<std::uint64_t> PowersOfPrimitiveRoot(std::uint64_t root,
+                                                 std::uint64_t modulus,
+                                                 std::size_t poly_len_log2);
+
+} // namespace psi::spiral::arith
\ No newline at end of file
diff --git a/psi/algorithm/spiral/arith/ntt_table_test.cc b/psi/algorithm/spiral/arith/ntt_table_test.cc
new file mode 100644
index 0000000..05fea79
--- /dev/null
+++ b/psi/algorithm/spiral/arith/ntt_table_test.cc
@@ -0,0 +1,127 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "psi/algorithm/spiral/arith/ntt_table.h"
+
+#include <chrono>
+#include <cstdint>
+#include <random>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "yacl/base/aligned_vector.h"
+
+#include "psi/algorithm/spiral/arith/ntt.h"
+#include "psi/algorithm/spiral/params.h"
+#include "psi/algorithm/spiral/util.h"
+
+namespace psi::spiral::arith {
+
+namespace {
+
+constexpr std::uint64_t kRefVal{519370102};
+
+constexpr std::size_t kMaxLoop{100};
+
+} // namespace
+
+TEST(NttTest, BuildNttTables) {
+  std::vector<std::uint64_t> moduli{268369921ULL, 249561089ULL};
+ std::size_t poly_len{2048};
+
+ NttTables res = arith::BuildNttTables(poly_len, moduli);
+
+ ASSERT_EQ(2, res.size());
+ ASSERT_EQ(4, res[0].size());
+ ASSERT_EQ(poly_len, res[0][0].size());
+
+ ASSERT_EQ(134184961, res[0][2][0]);
+ ASSERT_EQ(96647580, res[0][2][1]);
+
+ std::uint64_t x1 = 0;
+ for (std::size_t i = 0; i < res.size(); ++i) {
+ for (std::size_t j = 0; j < res[0].size(); ++j) {
+ for (std::size_t k = 0; k < res[0][0].size(); ++k) {
+ x1 ^= res[i][j][k];
+ }
+ }
+ }
+ ASSERT_EQ(kRefVal, x1);
+}
+
+TEST(NttTest, NttForward) {
+ auto params = util::GetFastExpansionTestingParam();
+
+  std::vector<uint64_t> v1(2 * 2048, 0);
+ v1[0] = 100;
+ v1[2048] = 100;
+
+ arith::NttForward(params, absl::MakeSpan(v1));
+ ASSERT_EQ(v1[50], 100);
+ ASSERT_EQ(v1[2048 + 50], 100);
+}
+
+TEST(NttTest, NttInverse) {
+ auto params = util::GetFastExpansionTestingParam();
+
+  std::vector<uint64_t> v1(2 * 2048, 100);
+ arith::NttInverse(params, absl::MakeSpan(v1));
+ ASSERT_EQ(v1[0], 100);
+ ASSERT_EQ(v1[2048], 100);
+ ASSERT_EQ(v1[50], 0);
+ ASSERT_EQ(v1[2048 + 50], 0);
+}
+
+TEST(NttTest, NttCorrect) {
+ auto params = util::GetFastExpansionTestingParam();
+
+  std::vector<uint64_t> v1(params.CrtCount() * params.PolyLen());
+ std::random_device rd;
+ std::mt19937_64 prg(rd());
+
+ uint64_t total_time = 0;
+
+ for (size_t l = 0; l < kMaxLoop; ++l) {
+ for (size_t i = 0; i < params.CrtCount(); ++i) {
+ for (size_t j = 0; j < params.PolyLen(); ++j) {
+        std::vector<size_t> indices{i, j};
+        std::vector<size_t> lengths{params.CrtCount(), params.PolyLen()};
+ auto idx = util::CalcIndex(indices, lengths);
+ uint64_t val = prg();
+ v1[idx] = val % params.Moduli(i);
+ }
+ }
+ // copy
+    std::vector<uint64_t> v2(v1.begin(), v1.end());
+
+ auto start = std::chrono::high_resolution_clock::now();
+ // forward
+ arith::NttForward(params, absl::MakeSpan(v2));
+ // inverse
+ arith::NttInverse(params, absl::MakeSpan(v2));
+ auto end = std::chrono::high_resolution_clock::now();
+
+ auto duration =
+ std::chrono::duration_cast(end - start);
+
+ total_time += duration.count();
+
+ ASSERT_EQ(v2, v1);
+ }
+
+ SPDLOG_INFO("{} Ntts, total time: {} micro-sec, each Ntt, time: {}", kMaxLoop,
+ total_time, static_cast(total_time) / kMaxLoop);
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/number_theory.h b/psi/algorithm/spiral/arith/number_theory.h
new file mode 100644
index 0000000..747d8d6
--- /dev/null
+++ b/psi/algorithm/spiral/arith/number_theory.h
@@ -0,0 +1,94 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "seal/seal.h"
+#include "seal/util/numth.h"
+#include "yacl/base/exception.h"
+
+#include "psi/algorithm/spiral/arith/arith.h"
+
+namespace psi::spiral::arith {
+
+inline bool IsPrimitiveRoot(std::uint64_t root, std::uint64_t degree,
+ std::uint64_t modulus) {
+ if (root == 0) {
+ return false;
+ }
+ return arith::ExponentitateUintMod(root, degree >> 1, modulus) == modulus - 1;
+}
+
+inline std::uint64_t GetPrimitiveRoot(std::uint64_t degree,
+ std::uint64_t modulus) {
+ YACL_ENFORCE(modulus > 1);
+ YACL_ENFORCE(degree >= 2);
+
+ std::uint64_t result = 0ULL;
+ // todo: consider remove seal Modulus usage
+ seal::Modulus mod(modulus);
+ // return must be true
+ YACL_ENFORCE(seal::util::try_primitive_root(degree, mod, result),
+ "{} mod {} primitive root do not exits", degree, modulus);
+
+ return result;
+}
+
+inline std::uint64_t GetPrimitiveRoot(std::uint64_t degree,
+ const seal::Modulus& mod) {
+ YACL_ENFORCE(mod.value() > 1);
+ YACL_ENFORCE(degree >= 2);
+
+ std::uint64_t result = 0ULL;
+ // return must be true
+ YACL_ENFORCE(seal::util::try_primitive_root(degree, mod, result),
+ "{} mod {} primitive root do not exits", degree, mod.value());
+
+ return result;
+}
+
+inline std::uint64_t GetMinimalPrimitiveRoot(std::uint64_t degree,
+ std::uint64_t modulus) {
+ std::uint64_t result{0};
+ // todo: consider remove seal Modulus usage
+ seal::Modulus mod(modulus);
+ // return must be true
+ YACL_ENFORCE(seal::util::try_minimal_primitive_root(degree, mod, result));
+ return result;
+}
+
+inline std::uint64_t GetMinimalPrimitiveRoot(std::uint64_t degree,
+ const seal::Modulus& mod) {
+ std::uint64_t result{0};
+ // return must be true
+ YACL_ENFORCE(seal::util::try_minimal_primitive_root(degree, mod, result));
+ return result;
+}
+
+inline std::uint64_t InvertUintMod(std::uint64_t value, std::uint64_t modulus) {
+ YACL_ENFORCE(value > 0);
+ seal::Modulus mod(modulus);
+ std::uint64_t result{0};
+ YACL_ENFORCE(seal::util::try_invert_uint_mod(value, mod, result));
+ return result;
+}
+
+inline std::uint64_t InvertUintMod(std::uint64_t value,
+ const seal::Modulus& mod) {
+ YACL_ENFORCE(value > 0);
+ std::uint64_t result{0};
+ YACL_ENFORCE(seal::util::try_invert_uint_mod(value, mod, result));
+ return result;
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/arith/number_theory_test.cc b/psi/algorithm/spiral/arith/number_theory_test.cc
new file mode 100644
index 0000000..fafaf78
--- /dev/null
+++ b/psi/algorithm/spiral/arith/number_theory_test.cc
@@ -0,0 +1,80 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "psi/algorithm/spiral/arith/number_theory.h"
+
+#include <algorithm>
+
+#include "gtest/gtest.h"
+#include "yacl/base/exception.h"
+
+namespace psi::spiral::arith {
+
+TEST(NumberTheoryTest, IsPrimitiveRoot) {
+ std::uint64_t modulus{11};
+ ASSERT_TRUE(arith::IsPrimitiveRoot(10, 2, modulus));
+ ASSERT_FALSE(arith::IsPrimitiveRoot(9, 2, modulus));
+ ASSERT_FALSE(arith::IsPrimitiveRoot(10, 4, modulus));
+}
+
+TEST(NumberTheoryTest, GetPrimitiveRoot) {
+ std::uint64_t modulus{11};
+
+ ASSERT_EQ(10, arith::GetPrimitiveRoot(2, modulus));
+ // primitive root do not exist
+ ASSERT_THROW(arith::GetPrimitiveRoot(3, modulus), yacl::EnforceNotMet);
+
+ modulus = 29;
+ ASSERT_EQ(28, arith::GetPrimitiveRoot(2, modulus));
+
+  std::vector<std::uint64_t> corrects{12, 17};
+ ASSERT_TRUE(std::find(corrects.begin(), corrects.end(),
+ arith::GetPrimitiveRoot(4, modulus)) != corrects.end());
+}
+
+TEST(NumberTheoryTest, GetMinimalPrimitiveRoot) {
+ std::uint64_t modulus{11};
+ ASSERT_EQ(10, arith::GetMinimalPrimitiveRoot(2, modulus));
+
+ modulus = 29;
+ ASSERT_EQ(28, arith::GetMinimalPrimitiveRoot(2, modulus));
+ ASSERT_EQ(12, arith::GetMinimalPrimitiveRoot(4, modulus));
+
+ modulus = 1234565441;
+ ASSERT_EQ(1234565440ULL, arith::GetMinimalPrimitiveRoot(2, modulus));
+ ASSERT_EQ(249725733ULL, arith::GetMinimalPrimitiveRoot(8, modulus));
+}
+
+TEST(NumberTheoryTest, InvertUintMod) {
+ std::uint64_t modulus;
+ std::uint64_t input;
+
+ input = 1;
+ modulus = 2;
+ ASSERT_EQ(1, arith::InvertUintMod(input, modulus));
+
+ input = 2;
+ modulus = 2;
+ ASSERT_THROW(arith::InvertUintMod(input, modulus), yacl::EnforceNotMet);
+
+ input = 3;
+ modulus = 2;
+ ASSERT_EQ(1, arith::InvertUintMod(input, modulus));
+
+ input = 0xFFFFFF;
+ modulus = 2;
+ ASSERT_EQ(1, arith::InvertUintMod(input, modulus));
+}
+
+} // namespace psi::spiral::arith
diff --git a/psi/algorithm/spiral/common.h b/psi/algorithm/spiral/common.h
new file mode 100644
index 0000000..4a217fa
--- /dev/null
+++ b/psi/algorithm/spiral/common.h
@@ -0,0 +1,139 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "absl/types/span.h"
+#include "spdlog/spdlog.h"
+
+namespace psi::spiral {
+
+template