diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 1df96737..00000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,6 +0,0 @@ -before_script: - - whoami; docker info - -build_image: - script: - - docker build -f docker/ubuntu18/DockerFile --build-arg http_proxy="$http_proxy" --build-arg https_proxy="$http_proxy" --build-arg MAVEN_OPTS="$MAVEN_OPTS" . diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..d62e6e63 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "rpmp/include/spdlog"] + path = rpmp/include/spdlog + url = https://github.com/gabime/spdlog.git + branch = master diff --git a/core/src/main/scala/org/apache/spark/storage/pmof/RdmaShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/pmof/RdmaShuffleBlockFetcherIterator.scala index 15b2a3ae..d3c3e064 100644 --- a/core/src/main/scala/org/apache/spark/storage/pmof/RdmaShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/pmof/RdmaShuffleBlockFetcherIterator.scala @@ -357,7 +357,6 @@ final class RdmaShuffleBlockFetcherIterator(context: TaskContext, reqsInFlight.incrementAndGet val blockManagerId = rdmaRequest.blockManagerId val shuffleBlockIdName = rdmaRequest.shuffleBlockIdName - println("shuffle block name " + shuffleBlockIdName) val pmofTransferService = shuffleClient.asInstanceOf[PmofTransferService] diff --git a/rpmp/CMakeLists.txt b/rpmp/CMakeLists.txt new file mode 100644 index 00000000..0609bd26 --- /dev/null +++ b/rpmp/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.11) +project(rpmof) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "-std=c++14 -g -pthread -fPIC") + +# Generate compile_commands.json +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# place binaries and libraries according to GNU standards +include(GNUInstallDirs) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}) + +if(CMAKE_CXX_COMPILER_ID MATCHES GNU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage") +endif() + +include(cmake/googletest.cmake) + +fetch_googletest( + ${PROJECT_SOURCE_DIR}/cmake + ${PROJECT_BINARY_DIR}/googletest +) + +find_package(Boost REQUIRED COMPONENTS program_options) +if(Boost_FOUND) + include_directories(${Boost_INCLUDE_DIRS}) +endif() + +include_directories(${PROJECT_SOURCE_DIR}/) +include_directories(${PROJECT_SOURCE_DIR}/include) +include_directories(${PROJECT_SOURCE_DIR}/include/spdlog/include) +include_directories(${PROJECT_BINARY_DIR}/googletest/googletest-src/googletest/include) + +enable_testing() + +add_subdirectory(include/spdlog) +add_subdirectory(pmpool) +add_subdirectory(test) +add_subdirectory(benchmark) + +add_executable(main main.cc) +target_link_libraries(main pmpool spdlog) diff --git a/rpmp/README.md b/rpmp/README.md new file mode 100644 index 00000000..a217906e --- /dev/null +++ b/rpmp/README.md @@ -0,0 +1 @@ +# RPMP diff --git a/rpmp/benchmark/CMakeLists.txt b/rpmp/benchmark/CMakeLists.txt new file mode 100644 index 00000000..468d6b15 --- /dev/null +++ b/rpmp/benchmark/CMakeLists.txt @@ -0,0 +1,17 @@ +add_executable(local_allocate local_allocate.cc) +target_link_libraries(local_allocate pmpool) + +add_executable(remote_allocate remote_allocate.cc) +target_link_libraries(remote_allocate pmpool) + +add_executable(remote_write remote_write.cc) 
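Since the top-level CMakeLists.txt fetches googletest at configure time, turns on enable_testing(), and adds a test/ subdirectory, new RPMP tests can follow the usual gtest_main pattern. The sketch below is illustrative only: it assumes a test target linked against gtest_main and pmpool, and it reuses the CircularBuffer get/put calls from the benchmark later in this patch with smaller, made-up sizes.

// Hypothetical smoke test for rpmp/test/, not part of this patch.
#include <cstring>

#include "gtest/gtest.h"
#include "pmpool/buffer/CircularBuffer.h"

TEST(CircularBufferTest, GetThenPutRoundTrip) {
  CircularBuffer buffer(1024, 16);   // assumed (block size, block count) arguments
  char* dest = buffer.get(1024);     // reserve one block
  ASSERT_NE(dest, nullptr);
  std::memset(dest, 'x', 1024);
  buffer.put(dest, 1024);            // hand the block back to the ring
}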
+target_link_libraries(remote_write pmpool) + +add_executable(remote_allocate_write remote_allocate_write.cc) +target_link_libraries(remote_allocate_write pmpool) + +add_executable(circularbuffer circularbuffer.cc) +target_link_libraries(circularbuffer pmpool) + +add_executable(remote_read remote_read.cc) +target_link_libraries(remote_read pmpool) diff --git a/rpmp/benchmark/circularbuffer.cc b/rpmp/benchmark/circularbuffer.cc new file mode 100644 index 00000000..35ee8047 --- /dev/null +++ b/rpmp/benchmark/circularbuffer.cc @@ -0,0 +1,35 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/benchmark/circularbuffer.cc + * Path: /mnt/spark-pmof/tool/rpmp/benchmark + * Created Date: Monday, December 30th 2019, 9:57:10 am + * Author: root + * + * Copyright (c) 2019 Your Company + */ + +#include + +#include "pmpool/buffer/CircularBuffer.h" + +#include // NOLINT + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +int main() { + CircularBuffer circularbuffer(1024 * 1024, 2048); + uint64_t start = timestamp_now(); + char str[1048576]; + memset(str, '0', 1048576); + for (int i = 0; i < 20480; i++) { + char* buf = circularbuffer.get(1048576); + memcpy(buf, str, 1048576); + circularbuffer.put(buf, 1048576); + } + uint64_t end = timestamp_now(); + std::cout << "pmemkv put test: 1048576 " + << " bytes test, consumes " << (end - start) / 1000.0 << std::endl; + return 0; +} \ No newline at end of file diff --git a/rpmp/benchmark/local_allocate.cc b/rpmp/benchmark/local_allocate.cc new file mode 100644 index 00000000..8faadd68 --- /dev/null +++ b/rpmp/benchmark/local_allocate.cc @@ -0,0 +1,85 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/benchmark/local_allocate.cc + * Path: /mnt/spark-pmof/tool/rpmp/benchmark + * Created Date: Tuesday, December 24th 2019, 8:54:38 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include + +#include +#include +#include // NOLINT +#include // NOLINT +#include + +#include "../pmpool/AllocatorProxy.h" +#include "../pmpool/Config.h" +#include "../pmpool/Log.h" +#include "gtest/gtest.h" + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +std::mutex mtx; +uint64_t count = 0; +char str[1048576]; + +void func(AllocatorProxy *proxy, int index) { + while (true) { + std::unique_lock lk(mtx); + uint64_t count_ = count++; + lk.unlock(); + if (count_ < 20480) { + uint64_t addr = proxy->allocate_and_write(1048576, nullptr, index); + proxy->write(addr, str, 1048576); + } else { + break; + } + } +} + +int main() { + std::shared_ptr config = std::make_shared(); + config->init(0, nullptr); + std::shared_ptr log = std::make_shared(config.get()); + auto allocatorProxy = new AllocatorProxy(config.get(), log.get(), nullptr); + allocatorProxy->init(); + std::vector threads; + memset(str, '0', 1048576); + + uint64_t start = timestamp_now(); + int num = 0; + for (int i = 0; i < 4; i++) { + num++; + auto t = new std::thread(func, allocatorProxy, i); + threads.push_back(t); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + if (i == 0) { + CPU_SET(2, &cpuset); + } else if (i == 1) { + CPU_SET(40, &cpuset); + } else if (i == 2) { + CPU_SET(27, &cpuset); + } else { + CPU_SET(60, &cpuset); + } + int rc = + pthread_setaffinity_np(t->native_handle(), sizeof(cpu_set_t), &cpuset); + } + for (int i = 0; i < num; i++) { + threads[i]->join(); + delete threads[i]; + } + uint64_t end = timestamp_now(); + std::cout << "pmemkv put test: 
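local_allocate.cc pins each worker thread to a hard-coded CPU with an if/else chain and discards the pthread_setaffinity_np return code. A small helper, sketched below, keeps the pinning in one place and surfaces failures; the CPU ids themselves stay machine-specific and are not suggested here.

#include <pthread.h>
#include <sched.h>

#include <cstdio>
#include <thread>

// Pin a std::thread to a single CPU and report failure instead of ignoring it.
bool pin_to_cpu(std::thread& t, int cpu) {
  cpu_set_t cpuset;
  CPU_ZERO(&cpuset);
  CPU_SET(cpu, &cpuset);
  int rc = pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset);
  if (rc != 0) {
    std::fprintf(stderr, "pthread_setaffinity_np(cpu %d) failed: %d\n", cpu, rc);
    return false;
  }
  return true;
}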
1048576 " + << " bytes test, consumes " << (end - start) / 1000.0 + << "s, throughput is " << 20480 / ((end - start) / 1000.0) << "MB/s" + << std::endl; + allocatorProxy->release_all(); +} diff --git a/rpmp/benchmark/remote_allocate.cc b/rpmp/benchmark/remote_allocate.cc new file mode 100644 index 00000000..7762783a --- /dev/null +++ b/rpmp/benchmark/remote_allocate.cc @@ -0,0 +1,86 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/benchmark/allocate_perf.cc + * Path: /mnt/spark-pmof/tool/rpmp/benchmark + * Created Date: Friday, December 20th 2019, 8:29:23 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include // NOLINT +#include +#include "pmpool/client/PmPoolClient.h" + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +std::atomic count = {0}; +std::mutex mtx; +std::vector clients; +std::map> addresses; + +void func(int i) { + while (true) { + uint64_t count_ = count++; + if (count_ < 20480) { + clients[i]->begin_tx(); + if (addresses.count(i) != 0) { + auto vec = addresses[i]; + uint64_t addr = clients[i]->alloc(1048576); + vec.push_back(addr); + } else { + std::vector vec; + uint64_t addr = clients[i]->alloc(1048576); + vec.push_back(addr); + addresses[i] = vec; + } + clients[i]->end_tx(); + } else { + break; + } + } +} + +int main() { + std::vector threads; + int num = 0; + for (int i = 0; i < 4; i++) { + PmPoolClient *client = new PmPoolClient("172.168.0.40", "12346"); + client->begin_tx(); + client->init(); + client->end_tx(); + clients.push_back(client); + num++; + } + uint64_t start = timestamp_now(); + for (int i = 0; i < num; i++) { + auto t = new std::thread(func, i); + threads.push_back(t); + } + for (int i = 0; i < num; i++) { + threads[i]->join(); + delete threads[i]; + } + uint64_t end = timestamp_now(); + std::cout << "pmemkv put test: 1048576 " + << " bytes test, consumes " << (end - start) / 1000.0 + << "s, throughput is " << 20480 / ((end - start) / 1000.0) << "MB/s" + << std::endl; + + for (int i = 0; i < num; i++) { + auto vec = addresses[i]; + while (!vec.empty()) { + auto address = vec.back(); + vec.pop_back(); + clients[i]->free(address); + } + } + std::cout << "freed." 
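The remote benchmarks repeat the begin_tx()/end_tx() bracket around every client call. A small RAII guard keeps that bracket exception-safe and harder to forget; this is only a sketch and assumes nothing beyond the PmPoolClient begin_tx/end_tx pair already used in these files.

#include "pmpool/client/PmPoolClient.h"

// Hypothetical RAII wrapper around the PmPoolClient transaction bracket.
class TxGuard {
 public:
  explicit TxGuard(PmPoolClient* client) : client_(client) { client_->begin_tx(); }
  ~TxGuard() { client_->end_tx(); }
  TxGuard(const TxGuard&) = delete;
  TxGuard& operator=(const TxGuard&) = delete;

 private:
  PmPoolClient* client_;
};

// Usage inside the benchmark loop:
//   TxGuard tx(clients[i]);
//   uint64_t addr = clients[i]->alloc(1048576);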
<< std::endl; + for (int i = 0; i < num; i++) { + clients[i]->wait(); + delete clients[i]; + } + return 0; +} diff --git a/rpmp/benchmark/remote_allocate_write.cc b/rpmp/benchmark/remote_allocate_write.cc new file mode 100644 index 00000000..2a1f5da7 --- /dev/null +++ b/rpmp/benchmark/remote_allocate_write.cc @@ -0,0 +1,90 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/benchmark/allocate_perf.cc + * Path: /mnt/spark-pmof/tool/rpmp/benchmark + * Created Date: Friday, December 20th 2019, 8:29:23 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include +#include // NOLINT +#include +#include "pmpool/client/PmPoolClient.h" + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +std::atomic count = {0}; +std::mutex mtx; +char str[1048576]; +std::vector clients; +std::map> addresses; + +void func1(int i) { + while (true) { + uint64_t count_ = count++; + if (count_ < 20480) { + clients[i]->begin_tx(); + if (addresses.count(i) != 0) { + auto vec = addresses[i]; + vec.push_back(clients[i]->write(str, 1048576)); + } else { + std::vector vec; + vec.push_back(clients[i]->write(str, 1048576)); + addresses[i] = vec; + } + clients[i]->end_tx(); + } else { + break; + } + } +} + +int main() { + std::vector threads; + memset(str, '0', 1048576); + + int num = 0; + std::cout << "start write." << std::endl; + num = 0; + count = 0; + for (int i = 0; i < 4; i++) { + PmPoolClient *client = new PmPoolClient("172.168.0.40", "12346"); + client->begin_tx(); + client->init(); + client->end_tx(); + clients.push_back(client); + num++; + } + uint64_t start = timestamp_now(); + for (int i = 0; i < num; i++) { + auto t = new std::thread(func1, i); + threads.push_back(t); + } + for (int i = 0; i < num; i++) { + threads[i]->join(); + delete threads[i]; + } + uint64_t end = timestamp_now(); + std::cout << "pmemkv put test: 1048576 " + << " bytes test, consumes " << (end - start) / 1000.0 + << "s, throughput is " << 20480 / ((end - start) / 1000.0) << "MB/s" + << std::endl; + for (int i = 0; i < num; i++) { + auto vec = addresses[i]; + while (!vec.empty()) { + auto address = vec.back(); + vec.pop_back(); + clients[i]->free(address); + } + } + std::cout << "freed." 
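Note that in both remote_allocate.cc and remote_allocate_write.cc the existing-key branch takes a copy of the per-thread vector (auto vec = addresses[i]), so every address after a thread's first one is pushed into a temporary and is never freed by the cleanup loop. Taking the element by reference (operator[] default-constructs it on first use) keeps the bookkeeping. The sketch below assumes the map holds one std::vector<uint64_t> per thread index; concurrent insertions would still need the existing mtx or per-thread vectors.

#include <cstdint>
#include <map>
#include <vector>

// Record an allocated address so the cleanup loop can free it later.
void record_allocation(std::map<int, std::vector<uint64_t>>& addresses, int i,
                       uint64_t addr) {
  auto& vec = addresses[i];  // reference, not a copy of the vector
  vec.push_back(addr);
}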
<< std::endl; + for (int i = 0; i < num; i++) { + clients[i]->wait(); + delete clients[i]; + } + return 0; +} diff --git a/rpmp/benchmark/remote_read.cc b/rpmp/benchmark/remote_read.cc new file mode 100644 index 00000000..ec1c74af --- /dev/null +++ b/rpmp/benchmark/remote_read.cc @@ -0,0 +1,82 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/benchmark/allocate_perf.cc + * Path: /mnt/spark-pmof/tool/rpmp/benchmark + * Created Date: Friday, December 20th 2019, 8:29:23 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include +#include +#include // NOLINT +#include "pmpool/Event.h" +#include "pmpool/client/PmPoolClient.h" + +char str[1048576]; +char str_read[1048576]; +std::atomic count = {0}; +std::mutex mtx; +std::vector clients; +std::vector addresses; +uint64_t buffer_size = 1024*64; +uint64_t buffer_num = 1000000; +int thread_num = 1; + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +void func(uint64_t i) { + while (true) { + uint64_t count_ = count++; + if (count_ < buffer_num) { + clients[i]->begin_tx(); + clients[i]->read(addresses[i], str_read, buffer_size); + clients[i]->end_tx(); + } else { + break; + } + } +} + +int main() { + std::vector threads; + + memset(str, '0', buffer_size); + for (int i = 0; i < thread_num; i++) { + PmPoolClient *client = new PmPoolClient("172.168.0.40", "12346"); + client->init(); + client->begin_tx(); + addresses.push_back(client->write(str, buffer_size)); + client->end_tx(); + clients.push_back(client); + } + uint64_t start = timestamp_now(); + for (int i = 0; i < thread_num; i++) { + auto t = new std::thread(func, i); + threads.push_back(t); + } + for (int i = 0; i < thread_num; i++) { + threads[i]->join(); + delete threads[i]; + } + uint64_t end = timestamp_now(); + std::cout << "pmemkv put test: " << buffer_size << " " + << " bytes test, consumes " << (end - start) / 1000.0 + << "s, throughput is " + << buffer_num / 1024.0 * buffer_size / 1024.0 / + ((end - start) / 1000.0) + << "MB/s" << std::endl; + for (int i = 0; i < thread_num; i++) { + clients[i]->begin_tx(); + clients[i]->free(addresses[i]); + clients[i]->end_tx(); + } + std::cout << "finished." 
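remote_read.cc derives MB/s from buffer_num, buffer_size, and the elapsed milliseconds. Factoring that arithmetic into a helper makes the units explicit; this only restates the formula already used above.

#include <cstdint>

// (ops * bytes_per_op) bytes, converted to MiB, divided by elapsed seconds.
double throughput_mb_per_s(uint64_t ops, uint64_t bytes_per_op, uint64_t elapsed_ms) {
  double mib = static_cast<double>(ops) * bytes_per_op / (1024.0 * 1024.0);
  return mib / (elapsed_ms / 1000.0);
}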
<< std::endl; + for (int i = 0; i < thread_num; i++) { + clients[i]->wait(); + } + return 0; +} diff --git a/rpmp/benchmark/remote_write.cc b/rpmp/benchmark/remote_write.cc new file mode 100644 index 00000000..f2fb1d0f --- /dev/null +++ b/rpmp/benchmark/remote_write.cc @@ -0,0 +1,98 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/benchmark/allocate_perf.cc + * Path: /mnt/spark-pmof/tool/rpmp/benchmark + * Created Date: Friday, December 20th 2019, 8:29:23 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include +#include // NOLINT +#include "pmpool/client/PmPoolClient.h" + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +int count = 0; +std::mutex mtx; +uint64_t addresses[20480]; +char str[1048576]; + +void func(PmPoolClient* client) { + while (true) { + std::unique_lock lk(mtx); + uint64_t count_ = count++; + lk.unlock(); + if (count_ < 20480) { + client->begin_tx(); + auto addr = client->alloc(1048576); + client->end_tx(); + addresses[count_] = addr; + } else { + break; + } + } +} + +void func1(PmPoolClient* client) { + while (true) { + std::unique_lock lk(mtx); + uint64_t count_ = count++; + lk.unlock(); + if (count_ < 20480) { + client->begin_tx(); + client->write(addresses[count_], str, 1048576); + client->end_tx(); + } else { + break; + } + } +} + +int main() { + std::vector threads; + PmPoolClient client("172.168.0.40", "12346"); + memset(str, '0', 1048576); + client.init(); + + int num = 0; + for (int i = 0; i < 1; i++) { + num++; + auto t = new std::thread(func, &client); + threads.push_back(t); + } + for (int i = 0; i < num; i++) { + threads[i]->join(); + delete threads[i]; + } + std::cout << "start write." << std::endl; + num = 0; + count = 0; + std::vector threads_1; + uint64_t start = timestamp_now(); + for (int i = 0; i < 8; i++) { + num++; + auto t = new std::thread(func1, &client); + threads_1.push_back(t); + } + for (int i = 0; i < num; i++) { + threads_1[i]->join(); + delete threads_1[i]; + } + uint64_t end = timestamp_now(); + std::cout << "pmemkv put test: 1048576 " + << " bytes test, consumes " << (end - start) / 1000.0 + << "s, throughput is " << 20480 / ((end - start) / 1000.0) << "MB/s" + << std::endl; + for (int i = 0; i < 20480; i++) { + client.begin_tx(); + client.free(addresses[i]); + client.end_tx(); + } + std::cout << "freed." 
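remote_write.cc claims work items by incrementing a plain int under a mutex, while the other benchmarks already use std::atomic for the same job. If the lock only protects the counter, an atomic fetch-and-increment is the lighter-weight equivalent; the sketch below shows just the work-claiming skeleton, with the client calls elided.

#include <atomic>
#include <cstdint>

std::atomic<uint64_t> next_index{0};
constexpr uint64_t kTotalOps = 20480;

void worker_loop() {
  while (true) {
    uint64_t idx = next_index.fetch_add(1);
    if (idx >= kTotalOps) break;
    // ... perform one allocation or write for slot idx, as in the original loops ...
  }
}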
<< std::endl; + client.wait(); + return 0; +} diff --git a/rpmp/cmake/googletest-download.cmake b/rpmp/cmake/googletest-download.cmake new file mode 100644 index 00000000..313be3c8 --- /dev/null +++ b/rpmp/cmake/googletest-download.cmake @@ -0,0 +1,20 @@ +# code copied from https://crascit.com/2015/07/25/cmake-gtest/ +cmake_minimum_required(VERSION 3.5 FATAL_ERROR) + +project(googletest-download NONE) + +include(ExternalProject) + +ExternalProject_Add( + googletest + SOURCE_DIR "@GOOGLETEST_DOWNLOAD_ROOT@/googletest-src" + BINARY_DIR "@GOOGLETEST_DOWNLOAD_ROOT@/googletest-build" + GIT_REPOSITORY + https://github.com/google/googletest.git + GIT_TAG + release-1.8.1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) diff --git a/rpmp/cmake/googletest.cmake b/rpmp/cmake/googletest.cmake new file mode 100644 index 00000000..5ca70908 --- /dev/null +++ b/rpmp/cmake/googletest.cmake @@ -0,0 +1,32 @@ +# the following code to fetch googletest +# is inspired by and adapted after https://crascit.com/2015/07/25/cmake-gtest/ +# download and unpack googletest at configure time + +macro(fetch_googletest _download_module_path _download_root) + set(GOOGLETEST_DOWNLOAD_ROOT ${_download_root}) + configure_file( + ${_download_module_path}/googletest-download.cmake + ${_download_root}/CMakeLists.txt + @ONLY + ) + unset(GOOGLETEST_DOWNLOAD_ROOT) + + execute_process( + COMMAND + "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" . + WORKING_DIRECTORY + ${_download_root} + ) + execute_process( + COMMAND + "${CMAKE_COMMAND}" --build . + WORKING_DIRECTORY + ${_download_root} + ) + + # adds the targers: gtest, gtest_main, gmock, gmock_main + add_subdirectory( + ${_download_root}/googletest-src + ${_download_root}/googletest-build + ) +endmacro() diff --git a/rpmp/include/xxhash/xxhash.h b/rpmp/include/xxhash/xxhash.h new file mode 100644 index 00000000..01488ef5 --- /dev/null +++ b/rpmp/include/xxhash/xxhash.h @@ -0,0 +1,321 @@ +#pragma once +#pragma clang system_header +#pragma gcc system_header +/* +xxHash - Extremely Fast Hash algorithm +Header File +Copyright (C) 2012-2016, Yann Collet. +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+You can contact the author at : +- xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + +/* Notice extracted from xxHash homepage : +xxHash is an extremely fast Hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MumurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. +A 64-bit version, named XXH64, is available since r35. +It offers much better speed, but for 64-bit applications only. +Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +#if defined (__cplusplus) +extern "C" { +#endif + + + /* **************************** + * Definitions + ******************************/ +#include /* size_t */ + typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode; + + + /* **************************** + * API modifier + ******************************/ + /** XXH_INLINE_ALL (and XXH_PRIVATE_API) + * This is useful to include xxhash functions in `static` mode + * in order to inline them, and remove their symbol from the public list. + * Inlining can offer dramatic performance improvement on small keys. + * Methodology : + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * `xxhash.c` is automatically included. + * It's not useful to compile and link it as a separate module. + */ +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# ifndef XXH_STATIC_LINKING_ONLY +# define XXH_STATIC_LINKING_ONLY +# endif +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif +#else +# define XXH_PUBLIC_API /* do nothing */ +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + + /*! XXH_NAMESPACE, aka Namespace Emulation : + * + * If you want to include _and expose_ xxHash functions from within your own library, + * but also want to avoid symbol collisions with other libraries which may also include xxHash, + * + * you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library + * with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values). + * + * Note that no change is required within the calling program as long as it includes `xxhash.h` : + * regular symbol name will be automatically translated by this header. 
+ */ +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +#endif + + + /* ************************************* + * Version + ***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 6 +#define XXH_VERSION_RELEASE 5 +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) + XXH_PUBLIC_API unsigned XXH_versionNumber(void); + + + /*-********************************************************************** + * 32-bit hash + ************************************************************************/ + typedef unsigned int XXH32_hash_t; + + /*! XXH32() : + Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input". + The memory between input & input+length must be valid (allocated and read-accessible). + "seed" can be used to alter the result predictably. + Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */ + XXH_PUBLIC_API XXH32_hash_t XXH32(const void* input, size_t length, unsigned int seed); + + /*====== Streaming ======*/ + typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ + XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); + XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); + XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + + XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed); + XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t* statePtr, const void* input, size_t length); + XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* statePtr); + + /* + * Streaming functions generate the xxHash of an input provided in multiple segments. + * Note that, for small input, they are slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * XXH state must first be allocated, using XXH*_createState() . + * + * Start a new hash by initializing state with a seed, using XXH*_reset(). + * + * Then, feed the hash state by calling XXH*_update() as many times as necessary. 
+ * The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using XXH*_digest(). + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a digest, + * and generate some new hashes later on, by calling again XXH*_digest(). + * + * When done, free XXH state space if it was allocated dynamically. + */ + + /*====== Canonical representation ======*/ + + typedef struct { unsigned char digest[4]; } XXH32_canonical_t; + XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); + XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + /* Default result type for XXH functions are primitive unsigned 32 and 64 bits. + * The canonical representation uses human-readable write convention, aka big-endian (large digits first). + * These functions allow transformation of hash result into and from its canonical format. + * This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. + */ + + +#ifndef XXH_NO_LONG_LONG + /*-********************************************************************** + * 64-bit hash + ************************************************************************/ + typedef unsigned long long XXH64_hash_t; + + /*! XXH64() : + Calculate the 64-bit hash of sequence of length "len" stored at memory address "input". + "seed" can be used to alter the result predictably. + This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark). + */ + XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, unsigned long long seed); + + /*====== Streaming ======*/ + typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ + XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); + XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + + XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed); + XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t* statePtr, const void* input, size_t length); + XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* statePtr); + + /*====== Canonical representation ======*/ + typedef struct { unsigned char digest[8]; } XXH64_canonical_t; + XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); + XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); +#endif /* XXH_NO_LONG_LONG */ + + + +#ifdef XXH_STATIC_LINKING_ONLY + + /* ================================================================================================ + This section contains declarations which are not guaranteed to remain stable. + They may change in future versions, becoming incompatible with a different version of the library. + These declarations should only be used with static linking. + Never use them in association with dynamic linking ! + =================================================================================================== */ + + /* These definitions are only present to allow + * static allocation of XXH state, on stack or in a struct for example. + * Never **ever** use members directly. 
*/ + +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + + struct XXH32_state_s { + uint32_t total_len_32; + uint32_t large_len; + uint32_t v1; + uint32_t v2; + uint32_t v3; + uint32_t v4; + uint32_t mem32[4]; + uint32_t memsize; + uint32_t reserved; /* never read nor write, might be removed in a future version */ + }; /* typedef'd to XXH32_state_t */ + + struct XXH64_state_s { + uint64_t total_len; + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t v4; + uint64_t mem64[4]; + uint32_t memsize; + uint32_t reserved[2]; /* never read nor write, might be removed in a future version */ + }; /* typedef'd to XXH64_state_t */ + +# else + + struct XXH32_state_s { + unsigned total_len_32; + unsigned large_len; + unsigned v1; + unsigned v2; + unsigned v3; + unsigned v4; + unsigned mem32[4]; + unsigned memsize; + unsigned reserved; /* never read nor write, might be removed in a future version */ + }; /* typedef'd to XXH32_state_t */ + +# ifndef XXH_NO_LONG_LONG /* remove 64-bit support */ + struct XXH64_state_s { + unsigned long long total_len; + unsigned long long v1; + unsigned long long v2; + unsigned long long v3; + unsigned long long v4; + unsigned long long mem64[4]; + unsigned memsize; + unsigned reserved[2]; /* never read nor write, might be removed in a future version */ + }; /* typedef'd to XXH64_state_t */ +# endif + +# endif + + +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# include "xxhash.c" /* include xxhash function bodies as `static`, for inlining */ +#endif + +#endif /* XXH_STATIC_LINKING_ONLY */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* XXHASH_H_5627135585666179 */ \ No newline at end of file diff --git a/rpmp/include/xxhash/xxhash.hpp b/rpmp/include/xxhash/xxhash.hpp new file mode 100644 index 00000000..81e82074 --- /dev/null +++ b/rpmp/include/xxhash/xxhash.hpp @@ -0,0 +1,719 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +/* +xxHash - Extremely Fast Hash algorithm +Header File +Copyright (C) 2012-2018, Yann Collet. +Copyright (C) 2017-2018, Piotr Pliszka. +All rights reserved. + +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+You can contact the author at : +- xxHash source repository : https://github.com/Cyan4973/xxHash +- xxHash C++ port repository : https://github.com/RedSpah/xxhash_cpp +*/ + +/* ************************************* +* Tuning parameters +***************************************/ +/*!XXH_FORCE_MEMORY_ACCESS : +* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. +* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. +* The below switch allow to select different access method for improved performance. +* Method 0 (default) : use `memcpy()`. Safe and portable. +* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). +* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. +* Method 2 : direct access. This method doesn't depend on compiler but violate C standard. +* It can generate buggy code on targets which do not support unaligned memory accesses. +* But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) +* See http://stackoverflow.com/a/32095106/646947 for details. +* Prefer these methods in priority order (0 > 1 > 2) +*/ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif defined(__INTEL_COMPILER) || (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + + +/*!XXH_FORCE_NATIVE_FORMAT : +* By default, xxHash library provides endian-independent Hash values, based on little-endian convention. +* Results are therefore identical for little-endian and big-endian CPU. +* This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. +* Should endian-independence be of no importance for your application, you may set the #define below to 1, +* to improve speed for Big-endian CPU. +* This option has no impact on Little_Endian CPU. +*/ +#if !defined(XXH_FORCE_NATIVE_FORMAT) || (XXH_FORCE_NATIVE_FORMAT == 0) /* can be defined externally */ +# define XXH_FORCE_NATIVE_FORMAT 0 +# define XXH_CPU_LITTLE_ENDIAN 1 +#endif + + +/*!XXH_FORCE_ALIGN_CHECK : +* This is a minor performance trick, only useful with lots of very small keys. +* It means : check for aligned/unaligned input. +* The check costs one initial branch per hash; +* set it to 0 when the input is guaranteed to be aligned, +* or when alignment doesn't matter for performance. +*/ +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +/*!XXH_CPU_LITTLE_ENDIAN : +* This is a CPU endian detection macro, will be +* automatically set to 1 (little endian) if XXH_FORCE_NATIVE_FORMAT +* is left undefined, XXH_FORCE_NATIVE_FORMAT is defined to 0, or if an x86/x86_64 compiler macro is defined. +* If left undefined, endianness will be determined at runtime, at the cost of a slight one-time overhead +* and a larger overhead due to get_endian() not being constexpr. 
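The XXH_FORCE_MEMORY_ACCESS comment above lists three access strategies; method 0, the default, reads potentially unaligned words through memcpy. In isolation (this is not the library's exact template) the idea looks like this:

#include <cstdint>
#include <cstring>

// Portable unaligned 32-bit load, the strategy behind XXH_FORCE_MEMORY_ACCESS == 0.
inline uint32_t load_u32(const void* p) {
  uint32_t v;
  std::memcpy(&v, p, sizeof(v));  // compilers typically lower this to a single load
  return v;
}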
+*/ +#ifndef XXH_CPU_LITTLE_ENDIAN +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_CPU_LITTLE_ENDIAN 1 +# endif +#endif + +/* ************************************* +* Compiler Specific Options +***************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +namespace xxh +{ + /* ************************************* + * Version + ***************************************/ + constexpr int cpp_version_major = 0; + constexpr int cpp_version_minor = 6; + constexpr int cpp_version_release = 5; + constexpr uint32_t version_number() { return cpp_version_major * 10000 + cpp_version_minor * 100 + cpp_version_release; } + + namespace hash_t_impl + { + /* ************************************* + * Basic Types - Detail + ***************************************/ + + using _hash32_underlying = uint32_t; + using _hash64_underlying = uint64_t; + + template + struct hash_type { using type = void; }; + template <> + struct hash_type<32> { using type = _hash32_underlying; }; + template <> + struct hash_type<64> { using type = _hash64_underlying; }; + } + + /* ************************************* + * Basic Types - Public + ***************************************/ + + template + using hash_t = typename hash_t_impl::hash_type::type; + using hash32_t = hash_t<32>; + using hash64_t = hash_t<64>; + + /* ************************************* + * Bit Functions - Public + ***************************************/ + + namespace bit_ops + { + /* **************************************** + * Intrinsics and Bit Operations + ******************************************/ + +#if defined(_MSC_VER) + inline uint32_t rotl32(uint32_t x, int32_t r) { return _rotl(x, r); } + inline uint64_t rotl64(uint64_t x, int32_t r) { return _rotl64(x, r); } +#else + inline uint32_t rotl32(uint32_t x, int32_t r) { return ((x << r) | (x >> (32 - r))); } + inline uint64_t rotl64(uint64_t x, int32_t r) { return ((x << r) | (x >> (64 - r))); } +#endif + +#if defined(_MSC_VER) /* Visual Studio */ + inline uint32_t swap32(uint32_t x) { return _byteswap_ulong(x); } + inline uint64_t swap64(uint64_t x) { return _byteswap_uint64(x); } +#elif XXH_GCC_VERSION >= 403 + inline uint32_t swap32(uint32_t x) { return __builtin_bswap32(x); } + inline uint64_t swap64(uint64_t x) { return __builtin_bswap64(x); } +#else + inline uint32_t swap32(uint32_t x) { return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) | ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff); } + inline uint64_t swap64(uint64_t x) { return ((x << 56) & 0xff00000000000000ULL) | ((x << 40) & 0x00ff000000000000ULL) | ((x << 24) & 0x0000ff0000000000ULL) | ((x << 8) & 0x000000ff00000000ULL) | ((x >> 8) & 0x00000000ff000000ULL) | ((x >> 24) & 0x0000000000ff0000ULL) | ((x >> 40) & 0x000000000000ff00ULL) | ((x >> 56) & 0x00000000000000ffULL); } +#endif + template + inline hash_t rotl(hash_t n, int32_t r) {}; + + template <> + inline hash_t<32> rotl<32>(hash_t<32> n, int32_t r) + { + return rotl32(n, r); + }; + + template <> + inline hash_t<64> rotl<64>(hash_t<64> n, int32_t r) + { + return rotl64(n, r); + }; + + template + inline hash_t swap(hash_t n) {}; + + template <> + inline hash_t<32> swap<32>(hash_t<32> n) + { + return swap32(n); + }; + + template <> + inline hash_t<64> swap<64>(hash_t<64> n) + { + return swap64(n); + }; + } + + /* ************************************* + * Memory Functions - Public + ***************************************/ + + enum class alignment : uint8_t { aligned, unaligned }; + enum 
class endianness : uint8_t { big_endian = 0, little_endian = 1, unspecified = 2 }; + + namespace mem_ops + { + /* ************************************* + * Memory Access + ***************************************/ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + + /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ + template + inline hash_t read_unaligned(const void* memPtr) { return *(const hash_t*)memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + + /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ + /* currently only defined for gcc and icc */ + template + using unalign = union { hash_t uval; } __attribute((packed)); + + template + inline hash_t read_unaligned(const void* memPtr) { return ((const unalign*)memPtr)->uval; } +#else + + /* portable and safe solution. Generally efficient. + * see : http://stackoverflow.com/a/32095106/646947 + */ + template + inline hash_t read_unaligned(const void* memPtr) + { + hash_t val; + memcpy(&val, memPtr, sizeof(val)); + return val; + } + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + inline hash_t<32> read32(const void* memPtr) { return read_unaligned<32>(memPtr); } + inline hash_t<64> read64(const void* memPtr) { return read_unaligned<64>(memPtr); } + + /* ************************************* + * Architecture Macros + ***************************************/ + + /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ + +#ifndef XXH_CPU_LITTLE_ENDIAN + + inline endianness get_endian(endianness endian) + { + static struct _dummy_t + { + std::array endian_lookup = { endianness::big_endian, endianness::little_endian, endianness::unspecified }; + const int g_one = 1; + _dummy_t() + { + endian_lookup[2] = static_cast(*(const char*)(&g_one)); + } + } _dummy; + + return _dummy.endian_lookup[(uint8_t)endian]; + } + + inline bool is_little_endian() + { + return get_endian(endianness::unspecified) == endianness::little_endian; + } + +#else + constexpr endianness get_endian(endianness endian) + { + constexpr std::array endian_lookup = { endianness::big_endian, endianness::little_endian, (XXH_CPU_LITTLE_ENDIAN) ? endianness::little_endian : endianness::big_endian }; + return endian_lookup[static_cast(endian)]; + } + + constexpr bool is_little_endian() + { + return get_endian(endianness::unspecified) == endianness::little_endian; + } + +#endif + + + + /* *************************** + * Memory reads + *****************************/ + + + template + inline hash_t readLE_align(const void* ptr, endianness endian, alignment align) + { + if (align == alignment::unaligned) + { + return endian == endianness::little_endian ? read_unaligned(ptr) : bit_ops::swap(read_unaligned(ptr)); + } + else + { + return endian == endianness::little_endian ? *reinterpret_cast*>(ptr) : bit_ops::swap(*reinterpret_cast*>(ptr)); + } + } + + template + inline hash_t readLE(const void* ptr, endianness endian) + { + return readLE_align(ptr, endian, alignment::unaligned); + } + + template + inline hash_t readBE(const void* ptr) + { + return is_little_endian() ? bit_ops::swap(read_unaligned(ptr)) : read_unaligned(ptr); + } + + template + inline alignment get_alignment(const void* input) + { + return ((XXH_FORCE_ALIGN_CHECK) && ((reinterpret_cast(input) & ((N / 8) - 1)) == 0)) ? 
xxh::alignment::aligned : xxh::alignment::unaligned; + } + } + + /* ******************************************************************* + * Hash functions + *********************************************************************/ + + namespace detail + { + /* ******************************************************************* + * Hash functions - Implementation + *********************************************************************/ + + constexpr static std::array primes32 = { 2654435761U, 2246822519U, 3266489917U, 668265263U, 374761393U }; + constexpr static std::array primes64 = { 11400714785074694791ULL, 14029467366897019727ULL, 1609587929392839161ULL, 9650029242287828579ULL, 2870177450012600261ULL }; + + template + constexpr hash_t PRIME(int32_t n) {}; + + template <> + constexpr hash32_t PRIME<32>(int32_t n) + { + return primes32[n - 1]; + } + + template <> + constexpr hash64_t PRIME<64>(int32_t n) + { + return primes64[n - 1]; + } + + template + inline hash_t round(hash_t seed, hash_t input) + { + seed += input * PRIME(2); + seed = bit_ops::rotl(seed, ((N == 32) ? 13 : 31)); + seed *= PRIME(1); + return seed; + } + + inline hash64_t mergeRound64(hash64_t acc, hash64_t val) + { + val = round<64>(0, val); + acc ^= val; + acc = acc * PRIME<64>(1) + PRIME<64>(4); + return acc; + } + + template + inline void endian_align_sub_mergeround([[maybe_unused]] hash_t& hash_ret, hash_t v1, hash_t v2, hash_t v3, hash_t v4) {}; + + template <> + inline void endian_align_sub_mergeround<64>(hash_t<64>& hash_ret, hash_t<64> v1, hash_t<64> v2, hash_t<64> v3, hash_t<64> v4) + { + hash_ret = mergeRound64(hash_ret, v1); + hash_ret = mergeRound64(hash_ret, v2); + hash_ret = mergeRound64(hash_ret, v3); + hash_ret = mergeRound64(hash_ret, v4); + } + + template + inline hash_t endian_align_sub_ending(hash_t hash_ret, const uint8_t* p, const uint8_t* bEnd, xxh::endianness endian, xxh::alignment align) {}; + + template <> + inline hash_t<32> endian_align_sub_ending<32>(hash_t<32> hash_ret, const uint8_t* p, const uint8_t* bEnd, xxh::endianness endian, xxh::alignment align) + { + while ((p + 4) <= bEnd) + { + hash_ret += mem_ops::readLE_align<32>(p, endian, align) * PRIME<32>(3); + hash_ret = bit_ops::rotl<32>(hash_ret, 17) * PRIME<32>(4); + p += 4; + } + + while (p < bEnd) + { + hash_ret += (*p) * PRIME<32>(5); + hash_ret = bit_ops::rotl<32>(hash_ret, 11) * PRIME<32>(1); + p++; + } + + hash_ret ^= hash_ret >> 15; + hash_ret *= PRIME<32>(2); + hash_ret ^= hash_ret >> 13; + hash_ret *= PRIME<32>(3); + hash_ret ^= hash_ret >> 16; + + return hash_ret; + } + + template <> + inline hash_t<64> endian_align_sub_ending<64>(hash_t<64> hash_ret, const uint8_t* p, const uint8_t* bEnd, xxh::endianness endian, xxh::alignment align) + { + while (p + 8 <= bEnd) + { + const hash64_t k1 = round<64>(0, mem_ops::readLE_align<64>(p, endian, align)); + hash_ret ^= k1; + hash_ret = bit_ops::rotl<64>(hash_ret, 27) * PRIME<64>(1) + PRIME<64>(4); + p += 8; + } + + if (p + 4 <= bEnd) + { + hash_ret ^= static_cast(mem_ops::readLE_align<32>(p, endian, align)) * PRIME<64>(1); + hash_ret = bit_ops::rotl<64>(hash_ret, 23) * PRIME<64>(2) + PRIME<64>(3); + p += 4; + } + + while (p < bEnd) + { + hash_ret ^= (*p) * PRIME<64>(5); + hash_ret = bit_ops::rotl<64>(hash_ret, 11) * PRIME<64>(1); + p++; + } + + hash_ret ^= hash_ret >> 33; + hash_ret *= PRIME<64>(2); + hash_ret ^= hash_ret >> 29; + hash_ret *= PRIME<64>(3); + hash_ret ^= hash_ret >> 32; + + return hash_ret; + } + + template + inline hash_t endian_align(const void* input, size_t len, hash_t 
seed, xxh::endianness endian, xxh::alignment align) + { + static_assert(!(N != 32 && N != 64), "You can only call endian_align in 32 or 64 bit mode."); + + const uint8_t* p = static_cast(input); + const uint8_t* bEnd = p + len; + hash_t hash_ret; + + if (len >= (N / 2)) + { + const uint8_t* const limit = bEnd - (N / 2); + hash_t v1 = seed + PRIME(1) + PRIME(2); + hash_t v2 = seed + PRIME(2); + hash_t v3 = seed + 0; + hash_t v4 = seed - PRIME(1); + + do + { + v1 = round(v1, mem_ops::readLE_align(p, endian, align)); p += (N / 8); + v2 = round(v2, mem_ops::readLE_align(p, endian, align)); p += (N / 8); + v3 = round(v3, mem_ops::readLE_align(p, endian, align)); p += (N / 8); + v4 = round(v4, mem_ops::readLE_align(p, endian, align)); p += (N / 8); + } while (p <= limit); + + hash_ret = bit_ops::rotl(v1, 1) + bit_ops::rotl(v2, 7) + bit_ops::rotl(v3, 12) + bit_ops::rotl(v4, 18); + + endian_align_sub_mergeround(hash_ret, v1, v2, v3, v4); + } + else { hash_ret = seed + PRIME(5); } + + hash_ret += static_cast>(len); + + return endian_align_sub_ending(hash_ret, p, bEnd, endian, align); + } + } + + template + hash_t xxhash(const void* input, size_t len, hash_t seed = 0, endianness endian = endianness::unspecified) + { + static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode."); + return detail::endian_align(input, len, seed, mem_ops::get_endian(endian), mem_ops::get_alignment(input)); + } + + template + hash_t xxhash(const std::basic_string& input, hash_t seed = 0, endianness endian = endianness::unspecified) + { + static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode."); + return detail::endian_align(static_cast(input.data()), input.length() * sizeof(T), seed, mem_ops::get_endian(endian), mem_ops::get_alignment(static_cast(input.data()))); + } + + template + hash_t xxhash(ContiguousIterator begin, ContiguousIterator end, hash_t seed = 0, endianness endian = endianness::unspecified) + { + static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode."); + using T = typename std::decay_t; + return detail::endian_align(static_cast(&*begin), (end - begin) * sizeof(T), seed, mem_ops::get_endian(endian), mem_ops::get_alignment(static_cast(&*begin))); + } + + template + hash_t xxhash(const std::vector& input, hash_t seed = 0, endianness endian = endianness::unspecified) + { + static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode."); + return detail::endian_align(static_cast(input.data()), input.size() * sizeof(T), seed, mem_ops::get_endian(endian), mem_ops::get_alignment(static_cast(input.data()))); + } + + template + hash_t xxhash(const std::array& input, hash_t seed = 0, endianness endian = endianness::unspecified) + { + static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode."); + return detail::endian_align(static_cast(input.data()), AN * sizeof(T), seed, mem_ops::get_endian(endian), mem_ops::get_alignment(static_cast(input.data()))); + } + + template + hash_t xxhash(const std::initializer_list& input, hash_t seed = 0, endianness endian = endianness::unspecified) + { + static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode."); + return detail::endian_align(static_cast(input.begin()), input.size() * sizeof(T), seed, mem_ops::get_endian(endian), mem_ops::get_alignment(static_cast(input.begin()))); + } + + + /* ******************************************************************* + * Hash streaming + 
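For reference, the one-shot overloads above can be exercised directly. This is only a usage sketch; it assumes nothing beyond the xxh::xxhash<N>() templates defined in this header and the include path set up by the top-level CMakeLists.txt.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include "xxhash/xxhash.hpp"

int main() {
  std::string key = "shuffle_0_1_2";                      // arbitrary example key
  xxh::hash64_t h1 = xxh::xxhash<64>(key);                // hash a std::basic_string
  std::vector<uint8_t> blob(4096, 0);
  xxh::hash32_t h2 = xxh::xxhash<32>(blob, /*seed=*/42);  // hash a std::vector with a seed
  std::cout << std::hex << h1 << " " << h2 << std::endl;
  return 0;
}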
*********************************************************************/ + enum class error_code : uint8_t { ok = 0, error }; + + template + class hash_state_t { + + uint64_t total_len = 0; + hash_t v1 = 0, v2 = 0, v3 = 0, v4 = 0; + std::array, 4> mem = {{ 0,0,0,0 }}; + uint32_t memsize = 0; + + inline error_code _update_impl(const void* input, size_t length, endianness endian) + { + const uint8_t* p = reinterpret_cast(input); + const uint8_t* const bEnd = p + length; + + if (!input) { return xxh::error_code::error; } + + total_len += length; + + if (memsize + length < (N / 2)) + { /* fill in tmp buffer */ + memcpy(reinterpret_cast(mem.data()) + memsize, input, length); + memsize += static_cast(length); + return error_code::ok; + } + + if (memsize) + { /* some data left from previous update */ + memcpy(reinterpret_cast(mem.data()) + memsize, input, (N / 2) - memsize); + + const hash_t* ptr = mem.data(); + v1 = detail::round(v1, mem_ops::readLE(ptr, endian)); ptr++; + v2 = detail::round(v2, mem_ops::readLE(ptr, endian)); ptr++; + v3 = detail::round(v3, mem_ops::readLE(ptr, endian)); ptr++; + v4 = detail::round(v4, mem_ops::readLE(ptr, endian)); + + p += (N / 2) - memsize; + memsize = 0; + } + + if (p <= bEnd - (N / 2)) + { + const uint8_t* const limit = bEnd - (N / 2); + + do + { + v1 = detail::round(v1, mem_ops::readLE(p, endian)); p += (N / 8); + v2 = detail::round(v2, mem_ops::readLE(p, endian)); p += (N / 8); + v3 = detail::round(v3, mem_ops::readLE(p, endian)); p += (N / 8); + v4 = detail::round(v4, mem_ops::readLE(p, endian)); p += (N / 8); + } while (p <= limit); + } + + if (p < bEnd) + { + memcpy(mem.data(), p, static_cast(bEnd - p)); + memsize = static_cast(bEnd - p); + } + + return error_code::ok; + } + + inline hash_t _digest_impl(endianness endian) const + { + const uint8_t* p = reinterpret_cast(mem.data()); + const uint8_t* const bEnd = reinterpret_cast(mem.data()) + memsize; + hash_t hash_ret; + + if (total_len > (N / 2)) + { + hash_ret = bit_ops::rotl(v1, 1) + bit_ops::rotl(v2, 7) + bit_ops::rotl(v3, 12) + bit_ops::rotl(v4, 18); + + detail::endian_align_sub_mergeround(hash_ret, v1, v2, v3, v4); + } + else { hash_ret = v3 + detail::PRIME(5); } + + hash_ret += static_cast>(total_len); + + return detail::endian_align_sub_ending(hash_ret, p, bEnd, endian, alignment::unaligned); + } + + public: + hash_state_t(hash_t seed = 0) + { + static_assert(!(N != 32 && N != 64), "You can only stream hashing in 32 or 64 bit mode."); + v1 = seed + detail::PRIME(1) + detail::PRIME(2); + v2 = seed + detail::PRIME(2); + v3 = seed + 0; + v4 = seed - detail::PRIME(1); + }; + + hash_state_t operator=(hash_state_t& other) + { + memcpy(this, other, sizeof(hash_state_t)); + } + + error_code reset(hash_t seed = 0) + { + memset(this, 0, sizeof(hash_state_t)); + v1 = seed + detail::PRIME(1) + detail::PRIME(2); + v2 = seed + detail::PRIME(2); + v3 = seed + 0; + v4 = seed - detail::PRIME(1); + return error_code::ok; + } + + error_code update(const void* input, size_t length, endianness endian = endianness::unspecified) + { + return _update_impl(input, length, mem_ops::get_endian(endian)); + } + + template + error_code update(const std::basic_string& input, endianness endian = endianness::unspecified) + { + return _update_impl(static_cast(input.data()), input.length() * sizeof(T), mem_ops::get_endian(endian)); + } + + template + error_code update(ContiguousIterator begin, ContiguousIterator end, endianness endian = endianness::unspecified) + { + using T = typename std::decay_t; + return 
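A streaming counterpart to the one-shot call, using the hash_state_t wrapper defined in this section. Again a usage sketch only; nothing beyond the update() and digest() members of this header is assumed.

#include <cstdint>
#include <iostream>
#include <vector>

#include "xxhash/xxhash.hpp"

int main() {
  xxh::hash_state64_t state(/*seed=*/0);
  std::vector<uint8_t> chunk(1024, 0xAB);
  for (int i = 0; i < 8; i++) {
    state.update(chunk);  // feed the input in 1 KiB pieces
  }
  std::cout << std::hex << state.digest() << std::endl;
  return 0;
}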
_update_impl(static_cast(&*begin), (end - begin) * sizeof(T), mem_ops::get_endian(endian)); + } + + template + error_code update(const std::vector& input, endianness endian = endianness::unspecified) + { + return _update_impl(static_cast(input.data()), input.size() * sizeof(T), mem_ops::get_endian(endian)); + } + + template + error_code update(const std::array& input, endianness endian = endianness::unspecified) + { + return _update_impl(static_cast(input.data()), AN * sizeof(T), mem_ops::get_endian(endian)); + } + + template + error_code update(const std::initializer_list& input, endianness endian = endianness::unspecified) + { + return _update_impl(static_cast(input.begin()), input.size() * sizeof(T), mem_ops::get_endian(endian)); + } + + hash_t digest(endianness endian = endianness::unspecified) + { + return _digest_impl(mem_ops::get_endian(endian)); + } + }; + + using hash_state32_t = hash_state_t<32>; + using hash_state64_t = hash_state_t<64>; + + + /* ******************************************************************* + * Canonical + *********************************************************************/ + + template + struct canonical_t + { + std::array digest;\ + + + + canonical_t(hash_t hash) + { + if (mem_ops::is_little_endian()) { hash = bit_ops::swap(hash); } + memcpy(digest.data(), &hash, sizeof(canonical_t)); + } + + hash_t get_hash() const + { + return mem_ops::readBE(&digest); + } + }; + + using canonical32_t = canonical_t<32>; + using canonical64_t = canonical_t<64>; +} diff --git a/rpmp/main.cc b/rpmp/main.cc new file mode 100644 index 00000000..c8fe9d16 --- /dev/null +++ b/rpmp/main.cc @@ -0,0 +1,42 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/main.cc + * Path: /mnt/spark-pmof/tool/rpmp + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include + +#include "pmpool/Config.h" +#include "pmpool/DataServer.h" +#include "pmpool/Base.h" +#include "pmpool/Log.h" + +/** + * @brief program entry of RPMP server + * @param argc + * @param argv + * @return int + */ +int ServerMain(int argc, char **argv) { + /// initialize Config class + std::shared_ptr config = std::make_shared(); + CHK_ERR("config init", config->init(argc, argv)); + /// initialize Log class + std::shared_ptr log = std::make_shared(config.get()); + /// initialize DataServer class + std::shared_ptr dataServer = + std::make_shared(config.get(), log.get()); + log->get_file_log()->info("start to initialize data server."); + CHK_ERR("data server init", dataServer->init()); + log->get_file_log()->info("data server initailized."); + dataServer->wait(); + return 0; +} + +int main(int argc, char **argv) { + ServerMain(argc, argv); + return 0; +} diff --git a/rpmp/pmpool/Allocator.h b/rpmp/pmpool/Allocator.h new file mode 100644 index 00000000..95cbe228 --- /dev/null +++ b/rpmp/pmpool/Allocator.h @@ -0,0 +1,51 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Allocator.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Monday, December 9th 2019, 9:06:37 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_ALLOCATOR_H_ +#define PMPOOL_ALLOCATOR_H_ + +#include + +#include + +class Chunk; + +using std::string; + +typedef uint64_t ptr_t; + +#define TO_GLOB(addr, base, wid) \ + ((ptr_t)(addr) - (ptr_t)(base) + ((ptr_t)(wid) << 48)) +#define GET_WID(global_address) ((ptr_t)(global_address) >> 48) + +struct Addr { + uint32_t aid; + uint64_t offset; + uint64_t size; +}; + +struct DiskInfo { + DiskInfo(string& path_, uint64_t 
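The TO_GLOB/GET_WID macros in Allocator.h pack the allocator (worker) id into the top 16 bits of a 64-bit global address and keep the pool offset in the low 48 bits. A small standalone illustration of that encoding, with a made-up base address and worker id:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t base = 0x7f0000000000ULL;   // assumed pool base address
  uint64_t local = base + 4096;        // an address inside that pool
  uint64_t wid = 3;                    // allocator index
  uint64_t global = (local - base) + (wid << 48);   // what TO_GLOB(local, base, wid) produces
  assert((global >> 48) == wid);                    // GET_WID recovers the allocator index
  assert((global & ((1ULL << 48) - 1)) == 4096);    // low 48 bits keep the offset
  return 0;
}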
size_) : path(path_), size(size_) {} + string path; + uint64_t size; +}; + +class Allocator { + public: + virtual int init() = 0; + virtual uint64_t allocate_and_write(uint64_t buffer_size, + const char* content = nullptr) = 0; + virtual int write(uint64_t address, const char* content, uint64_t size) = 0; + virtual int release(uint64_t address) = 0; + virtual int release_all() = 0; + virtual int dump_all() = 0; + virtual uint64_t get_virtual_address(uint64_t address) = 0; + virtual Chunk* get_rma_chunk() = 0; +}; +#endif // PMPOOL_ALLOCATOR_H_ diff --git a/rpmp/pmpool/AllocatorProxy.h b/rpmp/pmpool/AllocatorProxy.h new file mode 100644 index 00000000..cddc5830 --- /dev/null +++ b/rpmp/pmpool/AllocatorProxy.h @@ -0,0 +1,152 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/AllocatorProxy.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Tuesday, December 10th 2019, 12:53:48 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_ALLOCATORPROXY_H_ +#define PMPOOL_ALLOCATORPROXY_H_ + +#include +#include +#include +#include +#include + +#include "Allocator.h" +#include "Config.h" +#include "DataServer.h" +#include "Log.h" +#include "PmemAllocator.h" +#include "Base.h" + +using std::atomic; +using std::make_shared; +using std::unordered_map; +using std::string; +using std::vector; + +/** + * @brief Allocator proxy schedules fairly to guarantee events are assigned to + * different allocators. + * + */ +class AllocatorProxy { + public: + AllocatorProxy() = delete; + AllocatorProxy(Config *config, Log *log, NetworkServer *networkServer) + : config_(config), log_(log) { + vector paths = config_->get_pool_paths(); + vector sizes = config_->get_pool_sizes(); + assert(paths.size() == sizes.size()); + for (int i = 0; i < paths.size(); i++) { + DiskInfo *diskInfo = new DiskInfo(paths[i], sizes[i]); + diskInfos_.push_back(diskInfo); + allocators_.push_back( + new PmemObjAllocator(log_, diskInfo, networkServer, i)); + } + } + + ~AllocatorProxy() { + for (int i = 0; i < config_->get_pool_paths().size(); i++) { + delete allocators_[i]; + delete diskInfos_[i]; + } + allocators_.clear(); + diskInfos_.clear(); + } + + int init() { + for (int i = 0; i < diskInfos_.size(); i++) { + allocators_[i]->init(); + } + return 0; + } + + uint64_t allocate_and_write(uint64_t size, const char *content = nullptr, + int index = -1) { + uint64_t addr = 0; + if (index < 0) { + int random_index = buffer_id_++ % diskInfos_.size(); + addr = allocators_[random_index]->allocate_and_write(size, content); + } else { + addr = allocators_[index % diskInfos_.size()]->allocate_and_write( + size, content); + } + return addr; + } + + int write(uint64_t address, const char *content, uint64_t size) { + uint32_t wid = GET_WID(address); + return allocators_[wid]->write(address, content, size); + } + + int release(uint64_t address) { + uint32_t wid = GET_WID(address); + return allocators_[wid]->release(address); + } + + int release_all() { + for (int i = 0; i < diskInfos_.size(); i++) { + allocators_[i]->release_all(); + } + return 0; + } + + int dump_all() { + for (int i = 0; i < diskInfos_.size(); i++) { + allocators_[i]->dump_all(); + } + return 0; + } + + uint64_t get_virtual_address(uint64_t address) { + uint32_t wid = GET_WID(address); + return allocators_[wid]->get_virtual_address(address); + } + + Chunk *get_rma_chunk(uint64_t address) { + uint32_t wid = GET_WID(address); + return allocators_[wid]->get_rma_chunk(); + } + + void cache_chunk(uint64_t key, uint64_t address, uint64_t size) { + block_meta bm = {address,
size}; + cache_chunk(key, bm); + } + + void cache_chunk(uint64_t key, block_meta bm) { + if (kv_meta_map.count(key)) { + kv_meta_map[key].push_back(bm); + } else { + vector bml; + bml.push_back(bm); + kv_meta_map[key] = bml; + } + } + + vector get_cached_chunk(uint64_t key) { + if (kv_meta_map.count(key)) { + return kv_meta_map[key]; + } + return vector(); + } + + void del_chunk(uint64_t key) { + if (kv_meta_map.count(key)) { + kv_meta_map.erase(key); + } + } + + private: + Config *config_; + Log *log_; + vector allocators_; + vector diskInfos_; + atomic buffer_id_{0}; + unordered_map> kv_meta_map; +}; + +#endif // PMPOOL_ALLOCATORPROXY_H_ diff --git a/rpmp/pmpool/Base.h b/rpmp/pmpool/Base.h new file mode 100644 index 00000000..685c0d60 --- /dev/null +++ b/rpmp/pmpool/Base.h @@ -0,0 +1,54 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/fb/Encoder.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool/fb + * Created Date: Friday, December 27th 2019, 3:05:51 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_BASE_H_ +#define PMPOOL_BASE_H_ + +#include +#include +#include +#include +#include + +#define CHK_ERR(function_name, result) \ + { \ + if (result) { \ + fprintf(stderr, "%s: %s\n", function_name, strerror(result)); \ + return result; \ + } \ + } + +struct RequestMsg { + uint32_t type; + uint64_t rid; + uint64_t address; + uint64_t src_address; + uint64_t src_rkey; + uint64_t size; + uint64_t key; +}; + +struct RequestReplyMsg { + uint32_t type; + uint32_t success; + uint64_t rid; + uint64_t address; + uint64_t size; + uint64_t key; +}; + +struct block_meta { + block_meta() : block_meta(0, 0) {} + block_meta(uint64_t _address, uint64_t _size) + : address(_address), size(_size) {} + uint64_t address; + uint64_t size; +}; + +#endif // PMPOOL_BASE_H_ diff --git a/rpmp/pmpool/CMakeLists.txt b/rpmp/pmpool/CMakeLists.txt new file mode 100644 index 00000000..64c82108 --- /dev/null +++ b/rpmp/pmpool/CMakeLists.txt @@ -0,0 +1,18 @@ +add_library(pmpool SHARED DataServer.cc Protocol.cc Event.cc NetworkServer.cc hash/xxhash.cc client/PmPoolClient.cc client/NetworkClient.cc client/native/com_intel_rpmp_PmPoolClient.cc) +target_link_libraries(pmpool LINK_PUBLIC ${Boost_LIBRARIES} hpnl pmemobj) +set_target_properties(pmpool PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") + +if(UNIX AND NOT APPLE) + set(LINUX TRUE) +endif() + +if(APPLE) + set(JNI_INCLUDE "$ENV{JAVA_HOME}/include" "$ENV{JAVA_HOME}/include/darwin") +endif() +if(LINUX) + set(JNI_INCLUDE "$ENV{JAVA_HOME}/include" "$ENV{JAVA_HOME}/include/linux") +endif() +include_directories(${JNI_INCLUDE}) + +set(CMAKE_INSTALL_PREFIX "/usr/local") +install(TARGETS pmpool LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) diff --git a/rpmp/pmpool/Common.h b/rpmp/pmpool/Common.h new file mode 100644 index 00000000..56c6fa0b --- /dev/null +++ b/rpmp/pmpool/Common.h @@ -0,0 +1,28 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Common.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Wednesday, January 15th 2020, 7:44:44 pm + * Author: root + * + * Copyright (c) 2020 Your Company + */ + +#ifndef PMPOOL_COMMON_H_ +#define PMPOOL_COMMON_H_ + +#include + +class spin_mutex { + public: + std::atomic_flag flag = ATOMIC_FLAG_INIT; + spin_mutex() = default; + spin_mutex(const spin_mutex &) = delete; + spin_mutex &operator=(const spin_mutex &) = delete; + void lock() { + while (flag.test_and_set(std::memory_order_acquire)) { + } + } + void unlock() { flag.clear(std::memory_order_release); } +}; + +#endif // 
PMPOOL_COMMON_H_ diff --git a/rpmp/pmpool/Config.h b/rpmp/pmpool/Config.h new file mode 100644 index 00000000..2a1b9526 --- /dev/null +++ b/rpmp/pmpool/Config.h @@ -0,0 +1,139 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Config.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_CONFIG_H_ +#define PMPOOL_CONFIG_H_ + +#include +#include +#include +#include + +#include + +using boost::program_options::error; +using boost::program_options::options_description; +using boost::program_options::value; +using boost::program_options::variables_map; +using std::string; +using std::vector; + +/** + * @brief This class represents the current RPMP configuration. + * + */ +class Config { + public: + int init(int argc, char **argv) { + try { + options_description desc{"Options"}; + desc.add_options()("help,h", "Help screen")( + "address,a", value()->default_value("172.168.0.40"), + "set the rdma server address")( + "port,p", value()->default_value("12346"), + "set the rdma server port")("network_buffer_size,nbs", + value()->default_value(65536), + "set network buffer size")( + "network_buffer_num,nbn", value()->default_value(16), + "set network buffer number")("network_worker,nw", + value()->default_value(1), + "set network wroker number")( + "paths,ps", value>(), "set memory pool path")( + "sizes,ss", value>(), "set memory pool size")( + "log,l", value()->default_value("/tmp/rpmp.log"), + "set rpmp log file path")("log_level,ll", + value()->default_value("warn"), + "set log level"); + + variables_map vm; + store(parse_command_line(argc, argv, desc), vm); + notify(vm); + + if (vm.count("help")) { + std::cout << desc << '\n'; + return -1; + } + set_ip(vm["address"].as()); + set_port(vm["port"].as()); + set_network_buffer_size(vm["network_buffer_size"].as()); + set_network_buffer_num(vm["network_buffer_num"].as()); + set_network_worker_num(vm["network_worker"].as()); + pool_paths_.push_back("/dev/dax0.0"); + pool_paths_.push_back("/dev/dax0.1"); + pool_paths_.push_back("/dev/dax1.0"); + pool_paths_.push_back("/dev/dax1.1"); + sizes_.push_back(126833655808L); + sizes_.push_back(126833655808L); + sizes_.push_back(126833655808L); + sizes_.push_back(126833655808L); + affinities_.push_back(2); + affinities_.push_back(41); + affinities_.push_back(22); + affinities_.push_back(60); + set_log_path(vm["log"].as()); + set_log_level(vm["log_level"].as()); + } catch (const error &ex) { + std::cerr << ex.what() << '\n'; + } + return 0; + } + + string get_ip() { return ip_; } + void set_ip(string ip) { ip_ = ip; } + + string get_port() { return port_; } + void set_port(string port) { port_ = port; } + + int get_network_buffer_size() { return network_buffer_size_; } + void set_network_buffer_size(int network_buffer_size) { + network_buffer_size_ = network_buffer_size; + } + + int get_network_buffer_num() { return network_buffer_num_; } + void set_network_buffer_num(int network_buffer_num) { + network_buffer_num_ = network_buffer_num; + } + + int get_network_worker_num() { return network_worker_num_; } + void set_network_worker_num(int network_worker_num) { + network_worker_num_ = network_worker_num; + } + + vector &get_pool_paths() { return pool_paths_; } + void set_pool_paths(const vector &pool_paths) { + pool_paths_ = pool_paths; + } + + std::vector get_pool_sizes() { return sizes_; } + void set_pool_sizes(vector sizes) { sizes_ = sizes; } + + int get_pool_size() { return sizes_.size(); } 
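  // Config::init() above follows the standard boost::program_options flow
  // (options_description -> variables_map). A minimal, self-contained sketch
  // of that flow, reusing two of the option names and defaults from init();
  // illustrative only, not part of RPMP:
  //
  //   #include <iostream>
  //   #include <string>
  //   #include <boost/program_options.hpp>
  //
  //   namespace po = boost::program_options;
  //
  //   int main(int argc, char **argv) {
  //     po::options_description desc{"Options"};
  //     desc.add_options()
  //         ("help,h", "Help screen")
  //         ("address,a", po::value<std::string>()->default_value("172.168.0.40"),
  //          "set the rdma server address")
  //         ("port,p", po::value<std::string>()->default_value("12346"),
  //          "set the rdma server port");
  //     po::variables_map vm;
  //     po::store(po::parse_command_line(argc, argv, desc), vm);
  //     po::notify(vm);
  //     if (vm.count("help")) { std::cout << desc << '\n'; return 0; }
  //     std::cout << vm["address"].as<std::string>() << ":"
  //               << vm["port"].as<std::string>() << '\n';
  //     return 0;
  //   }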
+ + std::vector get_affinities_() { return affinities_; } + + string get_log_path() { return log_path_; } + void set_log_path(string log_path) { log_path_ = log_path; } + + string get_log_level() { return log_level_; } + void set_log_level(string log_level) { log_level_ = log_level; } + + private: + string ip_; + string port_; + int network_buffer_size_; + int network_buffer_num_; + int network_worker_num_; + vector pool_paths_; + vector sizes_; + vector affinities_; + string log_path_; + string log_level_; +}; + +#endif // PMPOOL_CONFIG_H_ diff --git a/rpmp/pmpool/DataServer.cc b/rpmp/pmpool/DataServer.cc new file mode 100644 index 00000000..47859c50 --- /dev/null +++ b/rpmp/pmpool/DataServer.cc @@ -0,0 +1,42 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/DataServer.cc + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include "pmpool/DataServer.h" + +#include "AllocatorProxy.h" +#include "Config.h" +#include "Digest.h" +#include "NetworkServer.h" +#include "Protocol.h" +#include "Log.h" + +DataServer::DataServer(Config *config, Log *log) : config_(config), log_(log) {} + +int DataServer::init() { + networkServer_ = std::make_shared(config_, log_); + CHK_ERR("network server init", networkServer_->init()); + log_->get_file_log()->info("network server initialized."); + + allocatorProxy_ = + std::make_shared(config_, log_, networkServer_.get()); + CHK_ERR("allocator proxy init", allocatorProxy_->init()); + log_->get_file_log()->info("allocator proxy initialized."); + + protocol_ = std::make_shared(config_, log_, networkServer_.get(), + allocatorProxy_.get()); + CHK_ERR("protocol init", protocol_->init()); + log_->get_file_log()->info("protocol initialized."); + + networkServer_->start(); + log_->get_file_log()->info("network server started."); + log_->get_console_log()->info("RPMP started..."); + return 0; +} + +void DataServer::wait() { networkServer_->wait(); } diff --git a/rpmp/pmpool/DataServer.h b/rpmp/pmpool/DataServer.h new file mode 100644 index 00000000..14e70ec1 --- /dev/null +++ b/rpmp/pmpool/DataServer.h @@ -0,0 +1,45 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/DataServer.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_DATASERVER_H_ +#define PMPOOL_DATASERVER_H_ + +#include +#include + +#include + +class Config; +class Protocol; +class Digest; +class DataList; +class AllocatorProxy; +class NetworkServer; +class Log; + +/** + * @brief DataServer is designed as distributed remote memory pool. + * DataServer on every node communicated with each other to guarantee data consistency. 
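 * As DataServer::init() above shows, one server instance wires together a
 * NetworkServer (HPNL-based RDMA transport), an AllocatorProxy (persistent
 * memory allocation across several pools) and a Protocol (request dispatch),
 * and only starts listening once all three are initialized.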
+ * + */ +class DataServer { + public: + DataServer() = delete; + explicit DataServer(Config* config, Log* log); + int init(); + void wait(); + private: + Config* config_; + Log* log_; + std::shared_ptr networkServer_; + std::shared_ptr allocatorProxy_; + std::shared_ptr protocol_; +}; + +#endif // PMPOOL_DATASERVER_H_ diff --git a/rpmp/pmpool/Digest.h b/rpmp/pmpool/Digest.h new file mode 100644 index 00000000..cd7b5c7f --- /dev/null +++ b/rpmp/pmpool/Digest.h @@ -0,0 +1,30 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Digest.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_DIGEST_H_ +#define PMPOOL_DIGEST_H_ + +#include +#include + +#include +#include "xxhash/xxhash.h" +#include "xxhash/xxhash.hpp" + +using std::string; + +class Digest { + public: + Digest() = default; + static void computeKeyHash(const string &key, uint64_t *hash) { + *hash = xxh::xxhash<64>(key); + } +}; + +#endif // PMPOOL_DIGEST_H_ diff --git a/rpmp/pmpool/Event.cc b/rpmp/pmpool/Event.cc new file mode 100644 index 00000000..2415c58d --- /dev/null +++ b/rpmp/pmpool/Event.cc @@ -0,0 +1,116 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Request.cc + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, December 12th 2019, 1:36:18 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include "pmpool/Event.h" + +#include "pmpool/buffer/CircularBuffer.h" + +Request::Request(RequestContext requestContext) + : data_(nullptr), size_(0), requestContext_(requestContext) {} + +Request::Request(char *data, uint64_t size, Connection *con) : size_(size) { + data_ = static_cast(std::malloc(size)); + memcpy(data_, data, size_); + requestContext_.con = con; +} + +Request::~Request() { + if (data_ != nullptr) { + std::free(data_); + data_ = nullptr; + } +} + +RequestContext &Request::get_rc() { return requestContext_; } + +void Request::encode() { + OpType rt = requestContext_.type; + assert(rt == ALLOC || rt == FREE || rt == WRITE || rt == READ); + requestMsg_.type = requestContext_.type; + requestMsg_.rid = requestContext_.rid; + requestMsg_.address = requestContext_.address; + requestMsg_.src_address = requestContext_.src_address; + requestMsg_.src_rkey = requestContext_.src_rkey; + requestMsg_.size = requestContext_.size; + requestMsg_.key = requestContext_.key; + + size_ = sizeof(requestMsg_); + data_ = static_cast(std::malloc(size_)); + memcpy(data_, &requestMsg_, size_); +} + +void Request::decode() { + assert(size_ == sizeof(requestMsg_)); + memcpy(&requestMsg_, data_, size_); + requestContext_.type = (OpType)requestMsg_.type; + requestContext_.rid = requestMsg_.rid; + requestContext_.address = requestMsg_.address; + requestContext_.src_address = requestMsg_.src_address; + requestContext_.src_rkey = requestMsg_.src_rkey; + requestContext_.size = requestMsg_.size; + requestContext_.key = requestMsg_.key; +} + +RequestReply::RequestReply(RequestReplyContext requestReplyContext) + : data_(nullptr), size_(0), requestReplyContext_(requestReplyContext) {} + +RequestReply::RequestReply(char *data, uint64_t size, Connection *con) + : size_(size) { + data_ = static_cast(std::malloc(size_)); + memcpy(data_, data, size_); + requestReplyContext_.con = con; +} + +RequestReply::~RequestReply() { + if (data_ != nullptr) { + std::free(data_); + data_ = nullptr; + } +} + +RequestReplyContext &RequestReply::get_rrc() { return requestReplyContext_; } + +void RequestReply::encode() { + 
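  // Wire layout produced here: the fixed-size RequestReplyMsg header is copied
  // byte-for-byte, optionally followed by the block_meta entries in
  // requestReplyContext_.bml (filled for GET_META replies). decode() below
  // reverses this, sizing the bml vector from the bytes left after the header.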
requestReplyMsg_.type = (OpType)requestReplyContext_.type; + requestReplyMsg_.success = requestReplyContext_.success; + requestReplyMsg_.rid = requestReplyContext_.rid; + requestReplyMsg_.address = requestReplyContext_.address; + requestReplyMsg_.size = requestReplyContext_.size; + requestReplyMsg_.key = requestReplyContext_.key; + auto msg_size = sizeof(requestReplyMsg_); + size_ = msg_size; + + /// copy data from block metadata list + uint32_t bml_size = 0; + if (!requestReplyContext_.bml.empty()) { + bml_size = sizeof(block_meta) * requestReplyContext_.bml.size(); + size_ += bml_size; + } + data_ = static_cast(std::malloc(size_)); + memcpy(data_, &requestReplyMsg_, msg_size); + if (bml_size != 0) { + memcpy(data_ + msg_size, &requestReplyContext_.bml[0], bml_size); + } +} + +void RequestReply::decode() { + memcpy(&requestReplyMsg_, data_, size_); + requestReplyContext_.type = (OpType)requestReplyMsg_.type; + requestReplyContext_.success = requestReplyMsg_.success; + requestReplyContext_.rid = requestReplyMsg_.rid; + requestReplyContext_.address = requestReplyMsg_.address; + requestReplyContext_.size = requestReplyMsg_.size; + requestReplyContext_.key = requestReplyMsg_.key; + if (size_ > sizeof(requestReplyMsg_)) { + auto bml_size = size_ - sizeof(requestReplyMsg_); + requestReplyContext_.bml.resize(bml_size / sizeof(block_meta)); + memcpy(&requestReplyContext_.bml[0], data_ + sizeof(requestReplyMsg_), + bml_size); + } +} diff --git a/rpmp/pmpool/Event.h b/rpmp/pmpool/Event.h new file mode 100644 index 00000000..136b73f3 --- /dev/null +++ b/rpmp/pmpool/Event.h @@ -0,0 +1,139 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Request.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Friday, December 13th 2019, 3:43:30 pm + * Author: root + * + * Copyright (c) Intel + */ + +#ifndef PMPOOL_EVENT_H_ +#define PMPOOL_EVENT_H_ + +#include +#include + +#include // NOLINT +#include + +#include "pmpool/Base.h" +#include "pmpool/PmemAllocator.h" + +using std::future; +using std::promise; +using std::vector; + +class RequestHandler; +class ClientRecvCallback; +class Protocol; + +enum OpType : uint32_t { + ALLOC = 1, + FREE, + PREPARE, + WRITE, + READ, + PUT, + GET, + GET_META, + DELETE, + REPLY = 1 << 16, + ALLOC_REPLY, + FREE_REPLY, + PREPARE_REPLY, + WRITE_REPLY, + READ_REPLY, + PUT_REPLY, + GET_REPLY, + GET_META_REPLY, + DELETE_REPLY +}; + +/** + * @brief Define two types of event in this file: Request, RequestReply + * Request: a event that client creates and sends to server. + * RequestReply: a event that server creates and sends to client. + * RequestContext and RequestReplyContext include the context information of the + * previous two events. 
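 *
 * A rough round trip, as a sketch (the real code reaches data_/size_ through
 * the friend classes RequestHandler and ClientRecvCallback; buf/len/con here
 * stand for the values handed to the receive callback):
 *
 *   RequestContext rc;
 *   rc.type = ALLOC;
 *   rc.rid = 1;
 *   rc.size = 4096;
 *   Request req(rc);
 *   req.encode();                        // RequestContext -> RequestMsg bytes
 *
 *   Request received(buf, len, con);     // on the receiving side
 *   received.decode();                   // RequestMsg bytes -> RequestContext
 *   OpType op = received.get_rc().type;  // == ALLOC
 *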
+ */ +struct RequestReplyContext { + OpType type; + uint32_t success; + uint64_t rid; + uint64_t address; + uint64_t src_address; + uint64_t dest_address; + uint64_t src_rkey; + uint64_t size; + uint64_t key; + Connection* con; + Chunk* ck; + vector bml; +}; + +template +inline void encode_(T* t, char* data, uint64_t* size) { + assert(t != nullptr); + memcpy(data, t, sizeof(t)); + *size = sizeof(t); +} + +template +inline void decode_(T* t, char* data, uint64_t size) { + assert(t != nullptr); + assert(size == sizeof(t)); + memcpy(t, data, size); +} + +class RequestReply { + public: + RequestReply() = delete; + explicit RequestReply(RequestReplyContext requestReplyContext); + RequestReply(char* data, uint64_t size, Connection* con); + ~RequestReply(); + RequestReplyContext& get_rrc(); + void decode(); + void encode(); + + private: + friend Protocol; + char* data_; + uint64_t size_; + RequestReplyMsg requestReplyMsg_; + RequestReplyContext requestReplyContext_; +}; + +typedef promise Promise; +typedef future Future; + +struct RequestContext { + OpType type; + uint64_t rid; + uint64_t address; + uint64_t src_address; + uint64_t src_rkey; + uint64_t size; + uint64_t key; + Connection* con; +}; + +class Request { + public: + Request() = delete; + explicit Request(RequestContext requestContext); + Request(char* data, uint64_t size, Connection* con); + ~Request(); + RequestContext& get_rc(); + void encode(); + void decode(); + + private: + friend RequestHandler; + friend ClientRecvCallback; + char* data_; + uint64_t size_; + RequestMsg requestMsg_; + RequestContext requestContext_; +}; + +#endif // PMPOOL_EVENT_H_ diff --git a/rpmp/pmpool/Log.h b/rpmp/pmpool/Log.h new file mode 100644 index 00000000..e396e188 --- /dev/null +++ b/rpmp/pmpool/Log.h @@ -0,0 +1,49 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Log.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Friday, February 28th 2020, 2:37:41 pm + * Author: root + * + * Copyright (c) 2020 Intel + */ + +#ifndef PMPOOL_LOG_H_ +#define PMPOOL_LOG_H_ + +#include + +#include "Config.h" +#include "spdlog/spdlog.h" + +class Log { + public: + explicit Log(Config *config) : config_(config) { + file_log_ = spdlog::basic_logger_mt("file_logger", config_->get_log_path()); + if (config_->get_log_level() == "debug") { + file_log_->set_level(spdlog::level::debug); + file_log_->flush_on(spdlog::level::debug); + } else if (config_->get_log_level() == "info") { + file_log_->set_level(spdlog::level::info); + file_log_->flush_on(spdlog::level::info); + } else if (config_->get_log_level() == "warn") { + file_log_->set_level(spdlog::level::warn); + file_log_->flush_on(spdlog::level::warn); + } else if (config_->get_log_level() == "error") { + file_log_->set_level(spdlog::level::err); + file_log_->flush_on(spdlog::level::err); + } else { + } + console_log_ = spdlog::stdout_color_mt("console"); + console_log_->flush_on(spdlog::level::info); + } + + std::shared_ptr get_file_log() { return file_log_; } + std::shared_ptr get_console_log() { return console_log_; } + + private: + Config *config_; + std::shared_ptr file_log_; + std::shared_ptr console_log_; +}; + +#endif // PMPOOL_LOG_H_ diff --git a/rpmp/pmpool/NetworkServer.cc b/rpmp/pmpool/NetworkServer.cc new file mode 100644 index 00000000..2f731a79 --- /dev/null +++ b/rpmp/pmpool/NetworkServer.cc @@ -0,0 +1,134 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/NetworkServer.cc + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Tuesday, December 24th 2019, 7:29:48 pm + * Author: root + 
* + * Copyright (c) 2019 Intel + */ + +#include "pmpool/NetworkServer.h" + +#include "Base.h" +#include "Config.h" +#include "Event.h" +#include "Log.h" +#include "buffer/CircularBuffer.h" + +NetworkServer::NetworkServer(Config *config, Log *log) + : config_(config), log_(log) { + time = 0; +} + +NetworkServer::~NetworkServer() { + for (int i = 0; i < buffer_id_; i++) { + unregister_rma_buffer(i); + } +} + +int NetworkServer::init() { + server_ = std::make_shared(config_->get_network_worker_num(), + config_->get_network_buffer_num()); + CHK_ERR("hpnl server init", server_->init()); + + chunkMgr_ = std::make_shared(server_.get(), + config_->get_network_buffer_size(), + config_->get_network_buffer_num()); + + server_->set_chunk_mgr(chunkMgr_.get()); + return 0; +} + +int NetworkServer::start() { + server_->start(); + CHK_ERR("hpnl server listen", server_->listen(config_->get_ip().c_str(), + config_->get_port().c_str())); + + circularBuffer_ = + std::make_shared(1024 * 1024, 4096, true, this); + return 0; +} + +void NetworkServer::wait() { server_->wait(); } + +Chunk *NetworkServer::register_rma_buffer(char *rma_buffer, uint64_t size) { + return server_->reg_rma_buffer(rma_buffer, size, buffer_id_++); +} + +void NetworkServer::unregister_rma_buffer(int buffer_id) { + server_->unreg_rma_buffer(buffer_id); +} + +void NetworkServer::get_dram_buffer(RequestReplyContext *rrc) { + char *buffer = circularBuffer_->get(rrc->size); + rrc->dest_address = (uint64_t)buffer; + + Chunk *base_ck = circularBuffer_->get_rma_chunk(); + uint64_t offset = circularBuffer_->get_offset(rrc->dest_address); + + // encapsulate new chunk + Chunk *ck = new Chunk(); + ck->buffer = static_cast(base_ck->buffer) + offset; + ck->capacity = base_ck->capacity; + ck->buffer_id = buffer_id_++; + ck->mr = base_ck->mr; + ck->size = rrc->size; + rrc->ck = ck; +} + +void NetworkServer::reclaim_dram_buffer(RequestReplyContext *rrc) { + char *buffer_tmp = reinterpret_cast(rrc->dest_address); + circularBuffer_->put(buffer_tmp, rrc->size); + delete rrc->ck; +} + +void NetworkServer::get_pmem_buffer(RequestReplyContext *rrc, Chunk *base_ck) { + Chunk *ck = new Chunk(); + ck->buffer = reinterpret_cast(rrc->dest_address); + ck->capacity = rrc->size; + ck->buffer_id = buffer_id_++; + ck->mr = base_ck->mr; + ck->size = rrc->size; + rrc->ck = ck; +} + +void NetworkServer::reclaim_pmem_buffer(RequestReplyContext *rrc) { + if (rrc->ck != nullptr) { + delete rrc->ck; + } +} + +ChunkMgr *NetworkServer::get_chunk_mgr() { return chunkMgr_.get(); } + +void NetworkServer::set_recv_callback(Callback *callback) { + server_->set_recv_callback(callback); +} + +void NetworkServer::set_send_callback(Callback *callback) { + server_->set_send_callback(callback); +} + +void NetworkServer::set_read_callback(Callback *callback) { + server_->set_read_callback(callback); +} + +void NetworkServer::set_write_callback(Callback *callback) { + server_->set_write_callback(callback); +} + +void NetworkServer::send(char *data, uint64_t size, Connection *con) { + auto ck = chunkMgr_->get(con); + std::memcpy(reinterpret_cast(ck->buffer), data, size); + ck->size = size; + con->send(ck); +} + +void NetworkServer::read(RequestReply *rr) { + RequestReplyContext rrc = rr->get_rrc(); + rrc.con->read(rrc.ck, 0, rrc.size, rrc.src_address, rrc.src_rkey); +} + +void NetworkServer::write(RequestReply *rr) { + RequestReplyContext rrc = rr->get_rrc(); + rrc.con->write(rrc.ck, 0, rrc.size, rrc.src_address, rrc.src_rkey); +} diff --git a/rpmp/pmpool/NetworkServer.h 
b/rpmp/pmpool/NetworkServer.h new file mode 100644 index 00000000..d125a83c --- /dev/null +++ b/rpmp/pmpool/NetworkServer.h @@ -0,0 +1,85 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/NetworkServer.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Tuesday, December 10th 2019, 3:14:59 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_NETWORKSERVER_H_ +#define PMPOOL_NETWORKSERVER_H_ + +#include +#include +#include + +#include +#include + +#include "RmaBufferRegister.h" + +class CircularBuffer; +class Config; +class RequestReply; +class RequestReplyContext; +class Log; + +/** + * @brief RPMP network service is based on HPNL, which is a completely + * asynchronous network library. RPMP currently supports RDMA iWarp and RoCE V2 + * protocol. + */ +class NetworkServer : public RmaBufferRegister { + public: + NetworkServer() = delete; + NetworkServer(Config *config, Log *log_); + ~NetworkServer(); + int init(); + int start(); + void wait(); + /// register DRAM or Persistent Memory as RDMA region. + /// Return chunk that is the wrapper of RDMA region if succeed, + /// return nullptr if fail. + Chunk *register_rma_buffer(char *rma_buffer, uint64_t size) override; + + /// unregister RDMA region for given buffer. + void unregister_rma_buffer(int buffer_id) override; + + /// get DRAM buffer from circular buffer pool. + void get_dram_buffer(RequestReplyContext *rrc); + + /// reclaim DRAM buffer from circular buffer pool. + void reclaim_dram_buffer(RequestReplyContext *rrc); + + /// get Persistent Memory buffer from circular buffer pool + void get_pmem_buffer(RequestReplyContext *rrc, Chunk *ck); + + /// reclaim Persistent Memory buffer form circular buffer pool + void reclaim_pmem_buffer(RequestReplyContext *rrc); + + /// return the pointer of chunk manager. + ChunkMgr *get_chunk_mgr(); + + /// since the network implementation is asynchronous, + /// we need to define callback better before starting network service. 
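  /// In practice Protocol::init() builds the four callbacks and registers them
  /// through set_recv_callback()/set_send_callback()/set_read_callback()/
  /// set_write_callback() before DataServer::init() calls start().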
+ void set_recv_callback(Callback *callback); + void set_send_callback(Callback *callback); + void set_read_callback(Callback *callback); + void set_write_callback(Callback *callback); + + void send(char *data, uint64_t size, Connection *con); + void read(RequestReply *rrc); + void write(RequestReply *rrc); + + private: + Config *config_; + Log* log_; + std::shared_ptr server_; + std::shared_ptr chunkMgr_; + std::shared_ptr circularBuffer_; + std::atomic buffer_id_{0}; + uint64_t time; +}; + +#endif // PMPOOL_NETWORKSERVER_H_ diff --git a/rpmp/pmpool/PmemAllocator.h b/rpmp/pmpool/PmemAllocator.h new file mode 100644 index 00000000..b6d279ce --- /dev/null +++ b/rpmp/pmpool/PmemAllocator.h @@ -0,0 +1,392 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/PmemObjAllocator.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Monday, December 9th 2019, 10:52:02 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_PMEMALLOCATOR_H_ +#define PMPOOL_PMEMALLOCATOR_H_ + +#include + +#include +#include // NOLINT +#include +#include +#include +#include +#include + +#include "Allocator.h" +#include "DataServer.h" +#include "Log.h" +#include "NetworkServer.h" + +using std::shared_ptr; +using std::unordered_map; + +#define PMEMOBJ_ALLOCATOR_LAYOUT_NAME "pmemobj_allocator_layout" + +// block header stored in pmem +struct block_hdr { + PMEMoid next; + PMEMoid pre; + uint64_t addr; + uint64_t size; +}; + +// block data entry stored in pmem +struct block_entry { + struct block_hdr hdr; + PMEMoid data; +}; + +// pmem root entry +struct Base { + PMEMoid head; + PMEMoid tail; + PMEMrwlock rwlock; + uint64_t bytes_written; +}; + +struct PmemContext { + PMEMobjpool *pop; + PMEMoid poid; + Base *base; +}; + +// pmem data allocation types +enum types { BLOCK_ENTRY_TYPE, DATA_TYPE, MAX_TYPE }; + +/** + * @brief libpmemobj based implementation of Allocator interface. 
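 *
 * Each allocation is linked into a persistent list rooted at struct Base and
 * is addressed by a "global" address built with TO_GLOB(): the pool/worker id
 * lives in the top 16 bits (see GET_WID()), which is how AllocatorProxy routes
 * later write/release/read requests back to the right pool.
 *
 * allocate_and_write() and release() below both follow the usual libpmemobj
 * transaction pattern, roughly as sketched here (pop and root_oid stand for
 * the pool handle and root object kept in pmemContext_):
 *
 *   jmp_buf env;
 *   if (setjmp(env)) { (void)pmemobj_tx_end(); return -1; }     // tx aborted
 *   if (pmemobj_tx_begin(pop, env, TX_PARAM_NONE)) return -1;   // start tx
 *   PMEMoid oid = pmemobj_tx_zalloc(size, DATA_TYPE);           // pmem alloc
 *   pmemobj_tx_add_range(root_oid, 0, sizeof(struct Base));     // undo log
 *   // ... link the new block into the list rooted at Base ...
 *   pmemobj_tx_commit();
 *   (void)pmemobj_tx_end();
 *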
+ * + */ +class PmemObjAllocator : public Allocator { + public: + PmemObjAllocator() = delete; + explicit PmemObjAllocator(Log *log, DiskInfo *diskInfos, + NetworkServer *server, int wid) + : log_(log), diskInfo_(diskInfos), server_(server), wid_(wid) {} + ~PmemObjAllocator() { close(); } + + int init() override { + memset(str, '0', 1048576); + if (create()) { + int res = open(); + if (res) { + string err_msg = pmemobj_errormsg(); + log_->get_file_log()->error("failed to open pmem pool, errmsg: " + + err_msg); + } + } + return 0; + } + + uint64_t allocate_and_write(uint64_t size, + const char *content = nullptr) override { + jmp_buf env; + if (setjmp(env)) { + // end the transaction + (void)pmemobj_tx_end(); + return -1; + } + + // begin a transaction, also acquiring the write lock for the data + if (pmemobj_tx_begin(pmemContext_.pop, env, TX_PARAM_RWLOCK, + &pmemContext_.base->rwlock, TX_PARAM_NONE)) { + perror("pmemobj_tx_begin failed in pmemkv put"); + return -1; + } + + // allocate the new node to be inserted + PMEMoid beo = + pmemobj_tx_alloc(sizeof(struct block_entry), BLOCK_ENTRY_TYPE); + if (beo.off == 0 && beo.pool_uuid_lo == 0) { + (void)pmemobj_tx_end(); + perror("pmemobj_tx_alloc failed in pmemkv put"); + return -1; + } + struct block_entry *bep = (struct block_entry *)pmemobj_direct(beo); + bep->data = pmemobj_tx_zalloc(size, DATA_TYPE); + bep->hdr.next = OID_NULL; + bep->hdr.addr = TO_GLOB((uint64_t)pmemobj_direct(bep->data), + (uint64_t)pmemContext_.pop, wid_); + bep->hdr.size = size; + + uint64_t start = + std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + char *pmem_data = static_cast(pmemobj_direct(bep->data)); + if (content != nullptr) { + memcpy(pmem_data, content, size); + } + uint64_t end = + std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + total += (end - start); + // std::cout << "index " << wid_ << ", total is " << total / 1000.0 + // << std::endl; + + // add the modified root object to the undo data + pmemobj_tx_add_range(pmemContext_.poid, 0, sizeof(struct Base)); + if (pmemContext_.base->tail.off == 0) { + // update head + pmemContext_.base->head = beo; + bep->hdr.pre = OID_NULL; + } else { + // add the modified tail entry to the undo data + bep->hdr.pre = pmemContext_.base->tail; + pmemobj_tx_add_range(pmemContext_.base->tail, 0, + sizeof(struct block_entry)); + ((struct block_entry *)pmemobj_direct(pmemContext_.base->tail)) + ->hdr.next = beo; + } + + pmemContext_.base->tail = beo; // update tail + pmemContext_.base->bytes_written += size; + pmemobj_tx_commit(); + (void)pmemobj_tx_end(); + + // update in-memory index + if (update_meta(beo)) { + return -1; + } + + return bep->hdr.addr; + } + + int write(uint64_t address, const char *content, uint64_t size) override { + std::unique_lock l(mtx); + if (!index_map.count(address)) { + return -1; + } + PMEMoid data = index_map[address]; + struct block_entry *bep = (struct block_entry *)pmemobj_direct(data); + char *pmem_data = static_cast(pmemobj_direct(bep->data)); + // pmemobj_memcpy_persist(pmemContext_.pop, pmem_data, content, size); + uint64_t start = + std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + memcpy(pmem_data, content, size); + uint64_t end = + std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + total += (end - start); + l.unlock(); + return 0; + } + + uint64_t get_virtual_address(uint64_t address) { + std::unique_lock 
l(mtx); + if (!index_map.count(address)) { + return -1; + } + PMEMoid data = index_map[address]; + struct block_entry *bep = (struct block_entry *)pmemobj_direct(data); + char *pmem_data = static_cast(pmemobj_direct(bep->data)); + l.unlock(); + return (uint64_t)pmem_data; + } + + int release(uint64_t address) override { + jmp_buf env; + if (setjmp(env)) { + // end the transaction + (void)pmemobj_tx_end(); + return -1; + } + + // begin a transaction, also acquiring the write lock for the data + if (pmemobj_tx_begin(pmemContext_.pop, env, TX_PARAM_RWLOCK, + &pmemContext_.base->rwlock, TX_PARAM_NONE)) { + perror("pmemobj_tx_begin failed in pmemkv put"); + return -1; + } + if (!index_map.count(address)) { + (void)pmemobj_tx_end(); + perror("address not found"); + return -1; + } + PMEMoid data = index_map[address]; + struct block_entry *bep = (struct block_entry *)pmemobj_direct(data); + struct block_entry *prev_bep = + (struct block_entry *)pmemobj_direct(bep->hdr.pre); + struct block_entry *next_bep = + (struct block_entry *)pmemobj_direct(bep->hdr.next); + pmemobj_tx_add_range(pmemContext_.poid, 0, sizeof(struct Base)); + if (prev_bep == nullptr) { + if (next_bep == nullptr) { + pmemContext_.base->head = OID_NULL; + pmemContext_.base->tail = OID_NULL; + } else { + pmemContext_.base->head = bep->hdr.next; + next_bep->hdr.pre = OID_NULL; + } + } else { + pmemobj_tx_add_range(bep->hdr.pre, 0, sizeof(struct Base)); + prev_bep->hdr.next = bep->hdr.next; + } + pmemContext_.base->bytes_written -= bep->hdr.size; + pmemobj_tx_add_range(data, 0, sizeof(struct Base)); + bep->hdr.pre = OID_NULL; + bep->hdr.next = OID_NULL; + pmemobj_free(&data); + pmemobj_free(&bep->data); + + pmemobj_tx_commit(); + (void)pmemobj_tx_end(); + + return 0; + } + + int release_all() override { + PMEMoid cur_oid = pmemContext_.base->head; + while (cur_oid.off != 0 && cur_oid.pool_uuid_lo != 0) { + struct block_entry *cur_bep = + (struct block_entry *)pmemobj_direct(cur_oid); + PMEMoid next_oid = cur_bep->hdr.next; + struct block_entry *next_bep = + (struct block_entry *)pmemobj_direct(next_oid); + pmemobj_free(&cur_oid); + pmemobj_free(&cur_bep->data); + cur_oid = next_oid; + } + pmemContext_.base->head = OID_NULL; + pmemContext_.base->tail = OID_NULL; + pmemContext_.base->bytes_written = 0; + + return 0; + } + + int dump_all() override { + std::cout << "******************worker " << wid_ + << " start dump*********************" << std::endl; + if (pmemobj_rwlock_rdlock(pmemContext_.pop, &pmemContext_.base->rwlock) != + 0) { + return -1; + } + struct block_entry *next_bep = + (struct block_entry *)pmemobj_direct(pmemContext_.base->head); + uint64_t read_offset = 0; + while (next_bep != nullptr) { + char *pmem_data = + reinterpret_cast(pmemobj_direct(next_bep->data)); + char *tmp = reinterpret_cast(std::malloc(next_bep->hdr.size)); + memcpy(tmp, pmem_data, next_bep->hdr.size); + std::cout << "dump address " << next_bep->hdr.addr << std::endl; + read_offset += next_bep->hdr.size; + std::free(tmp); + next_bep = (struct block_entry *)pmemobj_direct(next_bep->hdr.next); + } + pmemobj_rwlock_unlock(pmemContext_.pop, &pmemContext_.base->rwlock); + std::cout << "total size " << pmemContext_.base->bytes_written << std::endl; + std::cout << "******************worker " << wid_ + << " end dump*********************" << std::endl; + return 0; + } + + Chunk *get_rma_chunk() { return base_ck; } + + private: + int create() { + // debug setting + int sds_write_value = 0; + pmemobj_ctl_set(nullptr, "sds.at_create", &sds_write_value); + + 
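    // What follows: create the pmemobj pool directly on the configured path
    // (a devdax device in the default config; poolsize 0 takes the size from
    // the existing file or device), take the root object as struct Base, and,
    // if a NetworkServer was supplied, register the entire mapped pool as a
    // single RDMA region so remote reads and writes can target persistent
    // memory directly.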
pmemContext_.pop = pmemobj_create(diskInfo_->path.c_str(), + PMEMOBJ_ALLOCATOR_LAYOUT_NAME, 0, 0666); + if (pmemContext_.pop == nullptr) { + string err_msg = pmemobj_errormsg(); + log_->get_file_log()->warn("failed to create pmem pool, errmsg: " + + err_msg); + return -1; + } + pmemContext_.poid = pmemobj_root(pmemContext_.pop, sizeof(struct Base)); + pmemContext_.base = (struct Base *)pmemobj_direct(pmemContext_.poid); + pmemContext_.base->head = OID_NULL; + pmemContext_.base->tail = OID_NULL; + pmemContext_.base->bytes_written = 0; + + if (server_) { + base_ck = server_->register_rma_buffer( + reinterpret_cast(pmemContext_.pop), diskInfo_->size); + assert(base_ck != nullptr); + log_->get_console_log()->info( + "successfully registered Persistent Memory(" + diskInfo_->path + + ") as RDMA region"); + } + return 0; + } + + int open() { + // debug setting + int sds_write_value = 0; + pmemobj_ctl_set(nullptr, "sds.at_create", &sds_write_value); + + pmemContext_.pop = + pmemobj_open(diskInfo_->path.c_str(), PMEMOBJ_ALLOCATOR_LAYOUT_NAME); + if (pmemContext_.pop == nullptr) { + return -1; + } + + if (server_) { + base_ck = server_->register_rma_buffer( + reinterpret_cast(pmemContext_.pop), diskInfo_->size); + assert(base_ck != nullptr); + log_->get_console_log()->info( + "successfully registered Persistent Memory(" + diskInfo_->path + + ") as RDMA region"); + } + + pmemContext_.poid = pmemobj_root(pmemContext_.pop, sizeof(struct Base)); + pmemContext_.base = (struct Base *)pmemobj_direct(pmemContext_.poid); + PMEMoid next = pmemContext_.base->head; + while (next.off != 0 && next.pool_uuid_lo != 0) { + if (update_meta(next)) { + return -1; + } + struct block_entry *bep = (struct block_entry *)pmemobj_direct(next); + next = bep->hdr.next; + } + return 0; + } + + void close() { + pmemobj_close(pmemContext_.pop); + free_meta(); + } + + int update_meta(const PMEMoid &oid) { + std::lock_guard l(mtx); + struct block_entry *bep = (struct block_entry *)pmemobj_direct(oid); + if (!index_map.count(bep->hdr.addr)) { + index_map[bep->hdr.addr] = oid; + } else { + assert("invalide operation."); + } + return 0; + } + + int free_meta() { + std::lock_guard l(mtx); + index_map.clear(); + } + + private: + Log *log_; + DiskInfo *diskInfo_; + NetworkServer *server_; + int wid_; + PmemContext pmemContext_; + std::mutex mtx; + unordered_map index_map; + uint64_t total = 0; + char str[1048576]; + Chunk *base_ck; +}; + +#endif // PMPOOL_PMEMALLOCATOR_H_ diff --git a/rpmp/pmpool/Protocol.cc b/rpmp/pmpool/Protocol.cc new file mode 100644 index 00000000..cfa88f81 --- /dev/null +++ b/rpmp/pmpool/Protocol.cc @@ -0,0 +1,378 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Protocol.cc + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include "pmpool/Protocol.h" + +#include + +#include "AllocatorProxy.h" +#include "Config.h" +#include "Digest.h" +#include "Event.h" +#include "Log.h" +#include "NetworkServer.h" + +RecvCallback::RecvCallback(Protocol *protocol, ChunkMgr *chunkMgr) + : protocol_(protocol), chunkMgr_(chunkMgr) {} + +void RecvCallback::operator()(void *buffer_id, void *buffer_size) { + auto buffer_id_ = *static_cast(buffer_id); + Chunk *ck = chunkMgr_->get(buffer_id_); + assert(*static_cast(buffer_size) == ck->size); + Request *request = new Request(reinterpret_cast(ck->buffer), ck->size, + reinterpret_cast(ck->con)); + request->decode(); + protocol_->enqueue_recv_msg(request); + chunkMgr_->reclaim(ck, 
static_cast(ck->con)); +} + +ReadCallback::ReadCallback(Protocol *protocol) : protocol_(protocol) {} + +void ReadCallback::operator()(void *buffer_id, void *buffer_size) { + auto buffer_id_ = *static_cast(buffer_id); + protocol_->enqueue_rma_msg(buffer_id_); +} + +SendCallback::SendCallback(ChunkMgr *chunkMgr) : chunkMgr_(chunkMgr) {} + +void SendCallback::operator()(void *buffer_id, void *buffer_size) { + auto buffer_id_ = *static_cast(buffer_id); + auto ck = chunkMgr_->get(buffer_id_); + + /// free the memory of class RequestReply + auto reqeustReply = static_cast(ck->ptr); + delete reqeustReply; + + chunkMgr_->reclaim(ck, static_cast(ck->con)); +} + +WriteCallback::WriteCallback(Protocol *protocol) : protocol_(protocol) {} + +void WriteCallback::operator()(void *buffer_id, void *buffer_size) { + auto buffer_id_ = *static_cast(buffer_id); + protocol_->enqueue_rma_msg(buffer_id_); +} + +RecvWorker::RecvWorker(Protocol *protocol, int index) + : protocol_(protocol), index_(index) { + init = false; +} + +int RecvWorker::entry() { + if (!init) { + set_affinity(index_); + init = true; + } + Request *request; + bool res = pendingRecvRequestQueue_.wait_dequeue_timed( + request, std::chrono::milliseconds(1000)); + if (res) { + protocol_->handle_recv_msg(request); + } + return 0; +} + +void RecvWorker::abort() {} + +void RecvWorker::addTask(Request *request) { + pendingRecvRequestQueue_.enqueue(request); +} + +ReadWorker::ReadWorker(Protocol *protocol, int index) + : protocol_(protocol), index_(index) { + init = false; +} + +int ReadWorker::entry() { + if (!init) { + set_affinity(index_); + init = true; + } + RequestReply *requestReply; + bool res = pendingReadRequestQueue_.wait_dequeue_timed( + requestReply, std::chrono::milliseconds(1000)); + if (res) { + protocol_->handle_rma_msg(requestReply); + } + return 0; +} + +void ReadWorker::abort() {} + +void ReadWorker::addTask(RequestReply *rr) { + pendingReadRequestQueue_.enqueue(rr); +} + +FinalizeWorker::FinalizeWorker(Protocol *protocol) : protocol_(protocol) {} + +int FinalizeWorker::entry() { + RequestReply *requestReply; + bool res = pendingRequestReplyQueue_.wait_dequeue_timed( + requestReply, std::chrono::milliseconds(1000)); + if (res) { + protocol_->handle_finalize_msg(requestReply); + } + return 0; +} + +void FinalizeWorker::abort() {} + +void FinalizeWorker::addTask(RequestReply *requestReply) { + pendingRequestReplyQueue_.enqueue(requestReply); +} + +Protocol::Protocol(Config *config, Log *log, NetworkServer *server, + AllocatorProxy *allocatorProxy) + : config_(config), + log_(log), + networkServer_(server), + allocatorProxy_(allocatorProxy) { + time = 0; +} + +Protocol::~Protocol() { + for (auto worker : recvWorkers_) { + worker->stop(); + worker->join(); + } + for (auto worker : readWorkers_) { + worker->stop(); + worker->join(); + } + finalizeWorker_->stop(); + finalizeWorker_->join(); +} + +int Protocol::init() { + recvCallback_ = + std::make_shared(this, networkServer_->get_chunk_mgr()); + sendCallback_ = + std::make_shared(networkServer_->get_chunk_mgr()); + readCallback_ = std::make_shared(this); + writeCallback_ = std::make_shared(this); + + for (int i = 0; i < config_->get_pool_size(); i++) { + auto recvWorker = new RecvWorker(this, config_->get_affinities_()[i] - 1); + recvWorker->start(); + recvWorkers_.push_back(std::shared_ptr(recvWorker)); + } + + finalizeWorker_ = make_shared(this); + finalizeWorker_->start(); + + for (int i = 0; i < config_->get_pool_size(); i++) { + auto readWorker = new ReadWorker(this, 
config_->get_affinities_()[i]); + readWorker->start(); + readWorkers_.push_back(std::shared_ptr(readWorker)); + } + + networkServer_->set_recv_callback(recvCallback_.get()); + networkServer_->set_send_callback(sendCallback_.get()); + networkServer_->set_read_callback(readCallback_.get()); + networkServer_->set_write_callback(writeCallback_.get()); + return 0; +} + +void Protocol::enqueue_recv_msg(Request *request) { + RequestContext rc = request->get_rc(); + if (rc.address != 0) { + auto wid = GET_WID(rc.address); + recvWorkers_[wid]->addTask(request); + } else { + recvWorkers_[rc.rid % config_->get_pool_size()]->addTask(request); + } +} + +void Protocol::handle_recv_msg(Request *request) { + RequestContext rc = request->get_rc(); + RequestReplyContext rrc; + switch (rc.type) { + case ALLOC: { + uint64_t addr = allocatorProxy_->allocate_and_write( + rc.size, nullptr, rc.rid % config_->get_pool_size()); + auto wid = GET_WID(addr); + assert(wid == rc.rid % config_->get_pool_size()); + rrc.type = ALLOC_REPLY; + rrc.success = 0; + rrc.rid = rc.rid; + rrc.address = addr; + rrc.size = rc.size; + rrc.con = rc.con; + RequestReply *requestReply = new RequestReply(rrc); + rrc.ck->ptr = requestReply; + enqueue_finalize_msg(requestReply); + break; + } + case FREE: { + rrc.type = FREE_REPLY; + rrc.success = allocatorProxy_->release(rc.address); + rrc.rid = rc.rid; + rrc.address = rc.address; + rrc.size = rc.size; + rrc.con = rc.con; + RequestReply *requestReply = new RequestReply(rrc); + rrc.ck->ptr = requestReply; + enqueue_finalize_msg(requestReply); + break; + } + case WRITE: { + rrc.type = WRITE_REPLY; + rrc.success = 0; + rrc.rid = rc.rid; + rrc.address = rc.address; + rrc.src_address = rc.src_address; + rrc.src_rkey = rc.src_rkey; + rrc.size = rc.size; + rrc.con = rc.con; + networkServer_->get_dram_buffer(&rrc); + RequestReply *requestReply = new RequestReply(rrc); + rrc.ck->ptr = requestReply; + + std::unique_lock lk(rrcMtx_); + rrcMap_[rrc.ck->buffer_id] = requestReply; + lk.unlock(); + networkServer_->read(requestReply); + break; + } + case READ: { + rrc.type = READ_REPLY; + rrc.success = 0; + rrc.rid = rc.rid; + rrc.address = rc.address; + rrc.src_address = rc.src_address; + rrc.src_rkey = rc.src_rkey; + rrc.size = rc.size; + rrc.con = rc.con; + rrc.dest_address = allocatorProxy_->get_virtual_address(rrc.address); + rrc.ck = nullptr; + Chunk *base_ck = allocatorProxy_->get_rma_chunk(rrc.address); + networkServer_->get_pmem_buffer(&rrc, base_ck); + RequestReply *requestReply = new RequestReply(rrc); + rrc.ck->ptr = requestReply; + + std::unique_lock lk(rrcMtx_); + rrcMap_[rrc.ck->buffer_id] = requestReply; + lk.unlock(); + networkServer_->write(requestReply); + break; + } + case PUT: { + rrc.type = PUT_REPLY; + rrc.success = 0; + rrc.rid = rc.rid; + rrc.address = rc.address; + rrc.src_address = rc.src_address; + rrc.src_rkey = rc.src_rkey; + rrc.size = rc.size; + rrc.key = rc.key; + rrc.con = rc.con; + networkServer_->get_dram_buffer(&rrc); + RequestReply *requestReply = new RequestReply(rrc); + rrc.ck->ptr = requestReply; + + std::unique_lock lk(rrcMtx_); + rrcMap_[rrc.ck->buffer_id] = requestReply; + lk.unlock(); + networkServer_->read(requestReply); + break; + } + case GET_META: { + rrc.type = GET_META_REPLY; + rrc.success = 0; + rrc.rid = rc.rid; + rrc.size = rc.size; + rrc.key = rc.key; + rrc.con = rc.con; + RequestReply *requestReply = new RequestReply(rrc); + rrc.ck->ptr = requestReply; + enqueue_finalize_msg(requestReply); + } + case DELETE: { + rrc.type = DELETE_REPLY; + rrc.key = 
rc.key; + rrc.con = rc.con; + rrc.rid = rc.rid; + rrc.success = 0; + } + default: { break; } + } + + delete request; +} + +void Protocol::enqueue_finalize_msg(RequestReply *requestReply) { + finalizeWorker_->addTask(requestReply); +} + +void Protocol::handle_finalize_msg(RequestReply *requestReply) { + RequestReplyContext rrc = requestReply->get_rrc(); + if (rrc.type == PUT_REPLY) { + allocatorProxy_->cache_chunk(rrc.key, rrc.address, rrc.size); + } else if (rrc.type == GET_META_REPLY) { + auto bml = allocatorProxy_->get_cached_chunk(rrc.key); + requestReply->requestReplyContext_.bml = bml; + } else if (rrc.type == DELETE_REPLY) { + auto bml = allocatorProxy_->get_cached_chunk(rrc.key); + for (auto bm : bml) { + rrc.success = allocatorProxy_->release(bm.address); + if (rrc.success) { + break; + } + } + allocatorProxy_->del_chunk(rrc.key); + } else { + } + requestReply->encode(); + networkServer_->send(reinterpret_cast(requestReply->data_), + requestReply->size_, rrc.con); +} + +void Protocol::enqueue_rma_msg(uint64_t buffer_id) { + std::unique_lock lk(rrcMtx_); + RequestReply *requestReply = rrcMap_[buffer_id]; + lk.unlock(); + RequestReplyContext rrc = requestReply->get_rrc(); + if (rrc.address != 0) { + auto wid = GET_WID(rrc.address); + readWorkers_[wid]->addTask(requestReply); + } else { + readWorkers_[rrc.rid % config_->get_pool_size()]->addTask(requestReply); + } +} + +void Protocol::handle_rma_msg(RequestReply *requestReply) { + RequestReplyContext &rrc = requestReply->get_rrc(); + switch (rrc.type) { + case WRITE_REPLY: { + char *buffer = static_cast(rrc.ck->buffer); + if (rrc.address == 0) { + rrc.address = allocatorProxy_->allocate_and_write( + rrc.size, buffer, rrc.rid % config_->get_pool_size()); + } else { + allocatorProxy_->write(rrc.address, buffer, rrc.size); + } + networkServer_->reclaim_dram_buffer(&rrc); + break; + } + case READ_REPLY: { + networkServer_->reclaim_pmem_buffer(&rrc); + break; + } + case PUT_REPLY: { + char *buffer = static_cast(rrc.ck->buffer); + assert(rrc.address == 0); + rrc.address = allocatorProxy_->allocate_and_write( + rrc.size, buffer, rrc.rid % config_->get_pool_size()); + networkServer_->reclaim_dram_buffer(&rrc); + break; + } + default: { break; } + } + enqueue_finalize_msg(requestReply); +} diff --git a/rpmp/pmpool/Protocol.h b/rpmp/pmpool/Protocol.h new file mode 100644 index 00000000..b2cfb679 --- /dev/null +++ b/rpmp/pmpool/Protocol.h @@ -0,0 +1,194 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Protocol.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_PROTOCOL_H_ +#define PMPOOL_PROTOCOL_H_ + +#include +#include +#include + +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include + +#include "Event.h" +#include "ThreadWrapper.h" +#include "queue/blockingconcurrentqueue.h" +#include "queue/concurrentqueue.h" + +class Digest; +class AllocatorProxy; +class Protocol; +class NetworkServer; +class Config; +class Log; + +using moodycamel::BlockingConcurrentQueue; +using std::make_shared; + +struct MessageHeader { + MessageHeader(uint8_t msg_type, uint64_t sequence_id) { + msg_type_ = msg_type; + sequence_id_ = sequence_id; + } + uint8_t msg_type_; + uint64_t sequence_id_; + int msg_size; +}; + +class RecvCallback : public Callback { + public: + RecvCallback() = delete; + RecvCallback(Protocol *protocol, ChunkMgr *chunkMgr); + ~RecvCallback() override = default; + void operator()(void 
*buffer_id, void *buffer_size) override; + + private: + Protocol *protocol_; + ChunkMgr *chunkMgr_; +}; + +class SendCallback : public Callback { + public: + SendCallback() = delete; + explicit SendCallback(ChunkMgr *chunkMgr); + ~SendCallback() override = default; + void operator()(void *buffer_id, void *buffer_size) override; + + private: + ChunkMgr *chunkMgr_; +}; + +class ReadCallback : public Callback { + public: + ReadCallback() = delete; + explicit ReadCallback(Protocol *protocol); + ~ReadCallback() override = default; + void operator()(void *buffer_id, void *buffer_size) override; + + private: + Protocol *protocol_; +}; + +class WriteCallback : public Callback { + public: + WriteCallback() = delete; + explicit WriteCallback(Protocol *protocol); + ~WriteCallback() override = default; + void operator()(void *buffer_id, void *buffer_size) override; + + private: + Protocol *protocol_; +}; + +class RecvWorker : public ThreadWrapper { + public: + RecvWorker() = delete; + RecvWorker(Protocol *protocol, int index); + ~RecvWorker() override = default; + int entry() override; + void abort() override; + void addTask(Request *request); + + private: + Protocol *protocol_; + int index_; + bool init; + BlockingConcurrentQueue pendingRecvRequestQueue_; +}; + +class ReadWorker : public ThreadWrapper { + public: + ReadWorker() = delete; + ReadWorker(Protocol *protocol, int index); + ~ReadWorker() override = default; + int entry() override; + void abort() override; + void addTask(RequestReply *requestReply); + + private: + Protocol *protocol_; + int index_; + bool init; + BlockingConcurrentQueue pendingReadRequestQueue_; +}; + +class FinalizeWorker : public ThreadWrapper { + public: + FinalizeWorker() = delete; + explicit FinalizeWorker(Protocol *protocol); + ~FinalizeWorker() override = default; + int entry() override; + void abort() override; + void addTask(RequestReply *requestReply); + + private: + Protocol *protocol_; + BlockingConcurrentQueue pendingRequestReplyQueue_; +}; + +/** + * @brief Protocol connect NetworkServer and AllocatorProtocol to achieve + * network and storage co-design. Protocol maitains three queues: recv queue, + * finalize queue and rma queue. One thread per queue to handle specific event. + * recv queue-> to handle receive event. + * finalize queue-> to handle finalization event. + * rma queue-> to handle remote memory access event. 
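 *
 * Every worker runs the same loop (see RecvWorker::entry() in Protocol.cc);
 * stripped down, the pattern is the following sketch, where protocol stands
 * for the owning Protocol instance:
 *
 *   moodycamel::BlockingConcurrentQueue<Request*> queue;
 *
 *   // producer side, called from the HPNL receive callback
 *   queue.enqueue(request);
 *
 *   // consumer side, executed repeatedly by the worker thread
 *   Request* request;
 *   if (queue.wait_dequeue_timed(request, std::chrono::milliseconds(1000))) {
 *     protocol->handle_recv_msg(request);
 *   }
 *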
+ */ +class Protocol { + public: + Protocol() = delete; + Protocol(Config *config, Log *log, NetworkServer *server, + AllocatorProxy *allocatorProxy); + ~Protocol(); + int init(); + + friend class RecvCallback; + friend class RecvWorker; + + void enqueue_recv_msg(Request *request); + void handle_recv_msg(Request *request); + + void enqueue_finalize_msg(RequestReply *requestReply); + void handle_finalize_msg(RequestReply *requestReply); + + void enqueue_rma_msg(uint64_t buffer_id); + void handle_rma_msg(RequestReply *requestReply); + + public: + Config *config_; + Log *log_; + + private: + NetworkServer *networkServer_; + AllocatorProxy *allocatorProxy_; + + std::shared_ptr recvCallback_; + std::shared_ptr sendCallback_; + std::shared_ptr readCallback_; + std::shared_ptr writeCallback_; + + BlockingConcurrentQueue recvMsgQueue_; + BlockingConcurrentQueue readMsgQueue_; + + std::vector> recvWorkers_; + std::shared_ptr finalizeWorker_; + std::vector> readWorkers_; + + std::mutex rrcMtx_; + std::unordered_map rrcMap_; + uint64_t time; +}; + +#endif // PMPOOL_PROTOCOL_H_ diff --git a/rpmp/pmpool/RmaBufferRegister.h b/rpmp/pmpool/RmaBufferRegister.h new file mode 100644 index 00000000..d354f962 --- /dev/null +++ b/rpmp/pmpool/RmaBufferRegister.h @@ -0,0 +1,22 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/RmaBuffer.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Tuesday, December 24th 2019, 2:37:40 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_RMABUFFERREGISTER_H_ +#define PMPOOL_RMABUFFERREGISTER_H_ + +#include +#include + +class RmaBufferRegister { + public: + virtual Chunk* register_rma_buffer(char* rma_buffer, uint64_t size) = 0; + virtual void unregister_rma_buffer(int buffer_id) = 0; +}; + +#endif // PMPOOL_RMABUFFERREGISTER_H_ diff --git a/rpmp/pmpool/ThreadWrapper.h b/rpmp/pmpool/ThreadWrapper.h new file mode 100644 index 00000000..769cabea --- /dev/null +++ b/rpmp/pmpool/ThreadWrapper.h @@ -0,0 +1,88 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/ThreadWrapper.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_THREADWRAPPER_H_ +#define PMPOOL_THREADWRAPPER_H_ + +#include + +#include +#include // NOLINT +#include +#include // NOLINT +#include // NOLINT + +class ThreadWrapper { + public: + ThreadWrapper() : done(false) {} + virtual ~ThreadWrapper() = default; + void join() { + if (thread.joinable()) { + thread.join(); + } else { + std::unique_lock l(join_mutex); + join_event.wait(l, [=] { return done.load(); }); + } + } + void start(bool background_thread = false) { + thread = std::thread(&ThreadWrapper::thread_body, this); + if (background_thread) { + thread.detach(); + } + } + void stop() { done.store(true); } + void set_affinity(int cpu) { +#ifdef __linux__ + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + int res = pthread_setaffinity_np(thread.native_handle(), sizeof(cpu_set_t), + &cpuset); + if (res) { + abort(); + } +#endif + } + void thread_body() { + try { + while (true) { + int ret = entry(); + if (done.load() || ret == -1) { + if (!thread.joinable()) { + join_event.notify_all(); + } + break; + } + } + } catch (ThreadAbortException &) { + abort(); + } catch (std::exception &ex) { + ExceptionCaught(ex); + } catch (...) 
{ + UnknownExceptionCaught(); + } + } + + private: + class ThreadAbortException : std::exception {}; + + protected: + virtual int entry() = 0; + virtual void abort() = 0; + virtual void ExceptionCaught(const std::exception &exception) {} + virtual void UnknownExceptionCaught() {} + + private: + std::thread thread; + std::mutex join_mutex; + std::condition_variable join_event; + std::atomic_bool done = {false}; +}; + +#endif // PMPOOL_THREADWRAPPER_H_ diff --git a/rpmp/pmpool/buffer/CircularBuffer.h b/rpmp/pmpool/buffer/CircularBuffer.h new file mode 100644 index 00000000..a41ce3ab --- /dev/null +++ b/rpmp/pmpool/buffer/CircularBuffer.h @@ -0,0 +1,217 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/buffer/CircularBuffer.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool/buffer + * Created Date: Monday, December 23rd 2019, 2:31:42 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_BUFFER_CIRCULARBUFFER_H_ +#define PMPOOL_BUFFER_CIRCULARBUFFER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include // NOLINT +#include +#include // NOLINT +#include + +#include "../Common.h" +#include "../NetworkServer.h" +#include "../RmaBufferRegister.h" + +#define p2align(x, a) (((x) + (a)-1) & ~((a)-1)) + +class CircularBuffer { + public: + CircularBuffer() = delete; + CircularBuffer(const CircularBuffer &) = delete; + CircularBuffer(uint64_t buffer_size, uint32_t buffer_num, + bool is_server = false, RmaBufferRegister *rbr = nullptr) + : buffer_size_(buffer_size), + buffer_num_(buffer_num), + rbr_(rbr), + read_(0), + write_(0) { + uint64_t total = buffer_num_ * buffer_size_; + buffer_ = static_cast(mmap(0, buffer_num_ * buffer_size_, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + + // for the consideration of high performance, + // we'd better do memory paging before starting service. 
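    // (Pre-touching every byte of the anonymous mmap region forces the kernel
    // to back each page up front, so first-access page faults are paid once at
    // startup instead of inside the latency-critical get()/put() path.)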
+ // if (is_server) { + // for (uint64_t i = 0; i < total; i++) { + // buffer_[i] = 0; + // } + // } + + if (rbr_) { + ck_ = rbr_->register_rma_buffer(buffer_, buffer_num_ * buffer_size_); + } + + for (int i = 0; i < buffer_num; i++) { + bits.push_back(0); + } + } + ~CircularBuffer() { + munmap(buffer_, buffer_num_ * buffer_size_); + buffer_ = nullptr; + } + char *get(uint64_t bytes) { + uint64_t offset = 0; + bool res = get(bytes, &offset); + if (res == false) { + return nullptr; + } + return buffer_ + offset * buffer_size_; + } + void put(const char *data, uint64_t bytes) { + assert((data - buffer_) % buffer_size_ == 0); + uint64_t offset = (data - buffer_) / buffer_size_; + put(offset, bytes); + } + + void dump() { + std::cout << "********************************************" << std::endl; + std::cout << "read_ " << read_ << " write_ " << write_ << std::endl; + for (int i = 0; i < buffer_num_; i++) { + std::cout << bits[i] << " "; + } + std::cout << std::endl; + std::cout << "********************************************" << std::endl; + } + uint64_t get_read_() { return read_; } + uint64_t get_write_() { return write_; } + + bool get(uint64_t bytes, uint64_t *offset) { + uint32_t alloc_num = p2align(bytes, buffer_size_) / buffer_size_; + if (alloc_num > buffer_num_) { + return false; + } + std::lock_guard write_lk(write_mtx); + std::unique_lock read_lk(read_mtx); + uint64_t available = 0; + uint64_t end = 0; + uint64_t index = 0; + read_lt_write: + if (write_ >= read_) { // --------read_--------write_-------- + available = buffer_num_ - write_; + if (available >= alloc_num) { + index = write_; + end = write_ + alloc_num; + while (index < end) { + bits[index++] = 1; + } + *offset = write_; + write_ += alloc_num; + if (write_ == buffer_num_) { + write_ = 0; + } + goto success; + } else { + uint64_t index = write_; + while (index < buffer_num_) { + bits[index++] = 0; + } + write_ = 0; + goto write_lt_read; + } + } + write_lt_read: + // --------write_--------read_----------- + available = read_ - write_; + if (available >= alloc_num) { + index = write_; + end = write_ + alloc_num; + while (index < end) { + bits[index++] = 1; + } + *offset = write_; + write_ += alloc_num; + if (write_ == buffer_num_) { + write_ = 0; + } + goto success; + } else { + // wait + while ((available = read_ - write_) < alloc_num) { + read_cv.wait(read_lk); + if (read_ == 0) { + goto read_lt_write; + } + } + index = write_; + end = write_ + alloc_num; + while (index < end) { + bits[index++] = 1; + } + *offset = write_; + write_ += alloc_num; + if (write_ == buffer_num_) { + write_ = 0; + } + goto success; + } + success: + return true; + } + void put(uint64_t offset, uint64_t bytes) { + uint32_t alloc_num = p2align(bytes, buffer_size_) / buffer_size_; + assert(alloc_num <= buffer_num_ - read_); + std::unique_lock read_lk(read_mtx); + uint64_t index = offset; + uint64_t end = index + alloc_num; + while (index < end) { + bits[index] = 0; + if (read_ == index) { + read_++; + if (read_ == buffer_num_) { + read_ = 0; + } + } + index++; + read_cv.notify_all(); + } + index = read_; + while (bits[index] == 0) { + read_++; + index++; + if (read_ == buffer_num_) { + read_ = 0; + read_cv.notify_all(); + break; + } else { + read_cv.notify_all(); + } + } + } + Chunk *get_rma_chunk() { return ck_; } + uint64_t get_offset(uint64_t data) { return (data - (uint64_t)buffer_); } + + private: + char *buffer_; + char *tmp_; + uint64_t buffer_size_; + uint64_t buffer_num_; + RmaBufferRegister *rbr_; + Chunk *ck_; + std::vector bits; + 
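  // bits[i] == 1 marks buffer slot i as handed out by get(); put() clears it.
  // read_ and write_ are slot indices into the ring: get() advances write_,
  // while put() advances read_ past freed slots and wakes waiters via read_cv.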
uint64_t read_; + uint64_t write_; + std::mutex read_mtx; + std::condition_variable read_cv; + spin_mutex write_mtx; + char tmp[4096]; +}; + +#endif // PMPOOL_BUFFER_CIRCULARBUFFER_H_ diff --git a/rpmp/pmpool/client/NetworkClient.cc b/rpmp/pmpool/client/NetworkClient.cc new file mode 100644 index 00000000..1e32cba9 --- /dev/null +++ b/rpmp/pmpool/client/NetworkClient.cc @@ -0,0 +1,262 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/client/NetworkClient.cc + * Path: /mnt/spark-pmof/tool/rpmp/pmpool/client + * Created Date: Monday, December 16th 2019, 1:16:16 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include "pmpool/client/NetworkClient.h" + +#include +#include +#include + +#include "../Event.h" +#include "../buffer/CircularBuffer.h" + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +RequestHandler::RequestHandler(NetworkClient *networkClient) + : networkClient_(networkClient) {} + +void RequestHandler::addTask(Request *request) { handleRequest(request); } + +void RequestHandler::addTask(Request *request, std::function func) { + callback_map[request->get_rc().rid] = func; + handleRequest(request); +} + +void RequestHandler::wait() { + unique_lock lk(h_mtx); + while (!op_finished) { + cv.wait(lk); + } +} + +void RequestHandler::notify(RequestReply *requestReply) { + unique_lock lk(h_mtx); + requestReplyContext = requestReply->get_rrc(); + op_finished = true; + if (callback_map.count(requestReplyContext.rid) != 0) { + callback_map[requestReplyContext.rid](); + callback_map.erase(requestReplyContext.rid); + } else { + cv.notify_one(); + lk.unlock(); + } +} + +void RequestHandler::handleRequest(Request *request) { + op_finished = false; + OpType rt = request->get_rc().type; + switch (rt) { + case ALLOC: { + request->encode(); + networkClient_->send(reinterpret_cast(request->data_), + request->size_); + break; + } + case FREE: { + request->encode(); + networkClient_->send(reinterpret_cast(request->data_), + request->size_); + break; + } + case WRITE: { + request->encode(); + networkClient_->send(reinterpret_cast(request->data_), + request->size_); + break; + } + case READ: { + request->encode(); + networkClient_->send(reinterpret_cast(request->data_), + request->size_); + break; + } + case PUT: { + request->encode(); + networkClient_->send(reinterpret_cast(request->data_), + request->size_); + } + case GET_META: { + request->encode(); + networkClient_->send(reinterpret_cast(request->data_), + request->size_); + } + default: {} + } +} + +RequestReplyContext &RequestHandler::get() { return requestReplyContext; } + +ClientConnectedCallback::ClientConnectedCallback(NetworkClient *networkClient) { + networkClient_ = networkClient; +} + +void ClientConnectedCallback::operator()(void *param_1, void *param_2) { + auto con = static_cast(param_1); + networkClient_->connected(con); +} + +ClientRecvCallback::ClientRecvCallback(ChunkMgr *chunkMgr, + RequestHandler *requestHandler) + : chunkMgr_(chunkMgr), requestHandler_(requestHandler) {} + +void ClientRecvCallback::operator()(void *param_1, void *param_2) { + int mid = *static_cast(param_1); + auto ck = chunkMgr_->get(mid); + + // test start + // auto con = reinterpret_cast(ck->con); + // if (count_ == 0) { + // start = timestamp_now(); + // } + // count_++; + // if (count_ >= 1000000) { + // end = timestamp_now(); + // std::cout << "consumes " << (end-start)/1000.0 << std::endl; + // return; + // } + // RequestContext rc = {}; + // rc.type = 
READ; + // rc.rid = 0; + // rc.size = 0; + // rc.address = 0; + // Request request(rc); + // request.encode(); + // auto new_ck = chunkMgr_->get(con); + // memcpy(new_ck->buffer, request.data_, request.size_); + // new_ck->size = request.size_; + // con->send(new_ck); + // test end + + RequestReply requestReply(reinterpret_cast(ck->buffer), ck->size, + reinterpret_cast(ck->con)); + requestReply.decode(); + RequestReplyContext rrc = requestReply.get_rrc(); + switch (rrc.type) { + case ALLOC_REPLY: { + requestHandler_->notify(&requestReply); + break; + } + case FREE_REPLY: { + requestHandler_->notify(&requestReply); + break; + } + case WRITE_REPLY: { + requestHandler_->notify(&requestReply); + break; + } + case READ_REPLY: { + requestHandler_->notify(&requestReply); + break; + } + default: {} + } + chunkMgr_->reclaim(ck, static_cast(ck->con)); +} + +NetworkClient::NetworkClient(const string &remote_address, + const string &remote_port) + : NetworkClient(remote_address, remote_port, 1, 32, 65536, 64) {} + +NetworkClient::NetworkClient(const string &remote_address, + const string &remote_port, int worker_num, + int buffer_num_per_con, int buffer_size, + int init_buffer_num) + : remote_address_(remote_address), + remote_port_(remote_port), + worker_num_(worker_num), + buffer_num_per_con_(buffer_num_per_con), + buffer_size_(buffer_size), + init_buffer_num_(init_buffer_num), + connected_(false) {} + +NetworkClient::~NetworkClient() { + delete shutdownCallback; + delete connectedCallback; + delete sendCallback; + delete recvCallback; +} + +int NetworkClient::init(RequestHandler *requestHandler) { + client_ = new Client(worker_num_, buffer_num_per_con_); + if ((client_->init()) != 0) { + return -1; + } + chunkMgr_ = new ChunkPool(client_, buffer_size_, init_buffer_num_); + + client_->set_chunk_mgr(chunkMgr_); + + shutdownCallback = new ClientShutdownCallback(); + connectedCallback = new ClientConnectedCallback(this); + recvCallback = new ClientRecvCallback(chunkMgr_, requestHandler); + sendCallback = new ClientSendCallback(chunkMgr_); + + client_->set_shutdown_callback(shutdownCallback); + client_->set_connected_callback(connectedCallback); + client_->set_recv_callback(recvCallback); + client_->set_send_callback(sendCallback); + + client_->start(); + int res = client_->connect(remote_address_.c_str(), remote_port_.c_str()); + unique_lock lk(con_mtx); + while (!connected_) { + con_v.wait(lk); + } + + circularBuffer_ = make_shared(1024 * 1024, 512, false, this); +} + +void NetworkClient::shutdown() { client_->shutdown(); } + +void NetworkClient::wait() { client_->wait(); } + +Chunk *NetworkClient::register_rma_buffer(char *rma_buffer, uint64_t size) { + return client_->reg_rma_buffer(rma_buffer, size, buffer_id_++); +} + +void NetworkClient::unregister_rma_buffer(int buffer_id) { + client_->unreg_rma_buffer(buffer_id); +} + +uint64_t NetworkClient::get_dram_buffer(const char *data, uint64_t size) { + char *dest = circularBuffer_->get(size); + if (data) { + memcpy(dest, data, size); + } + return (uint64_t)dest; +} + +void NetworkClient::reclaim_dram_buffer(uint64_t src_address, uint64_t size) { + circularBuffer_->put(reinterpret_cast(src_address), size); +} + +uint64_t NetworkClient::get_rkey() { + return circularBuffer_->get_rma_chunk()->mr->key; +} + +void NetworkClient::connected(Connection *con) { + std::unique_lock lk(con_mtx); + con_ = con; + connected_ = true; + con_v.notify_all(); + lk.unlock(); +} + +void NetworkClient::send(char *data, uint64_t size) { + auto ck = chunkMgr_->get(con_); + 
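  // The chunk is drawn from the ChunkPool created in init(); the payload is
  // copied into its buffer before posting, and ClientSendCallback reclaims the
  // chunk once the send completes.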
std::memcpy(reinterpret_cast(ck->buffer), data, size); + ck->size = size; + con_->send(ck); +} + +void NetworkClient::read(Request *request) { + RequestContext rc = request->get_rc(); +} diff --git a/rpmp/pmpool/client/NetworkClient.h b/rpmp/pmpool/client/NetworkClient.h new file mode 100644 index 00000000..f9e4f54c --- /dev/null +++ b/rpmp/pmpool/client/NetworkClient.h @@ -0,0 +1,167 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/client/NetworkClient.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool/client + * Created Date: Wednesday, December 11th 2019, 2:02:46 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_CLIENT_NETWORKCLIENT_H_ +#define PMPOOL_CLIENT_NETWORKCLIENT_H_ + +#include +#include + +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include +#include +#include + +#include "../Event.h" +#include "../RmaBufferRegister.h" +#include "../ThreadWrapper.h" +#include "../queue/blockingconcurrentqueue.h" +#include "../queue/concurrentqueue.h" + +using moodycamel::BlockingConcurrentQueue; +using std::atomic; +using std::condition_variable; +using std::future; +using std::make_shared; +using std::mutex; +using std::promise; +using std::shared_ptr; +using std::string; +using std::unique_lock; +using std::unordered_map; + +class NetworkClient; +class CircularBuffer; +class Connection; +class ChunkMgr; + +typedef promise Promise; +typedef future Future; + +class RequestHandler { + public: + explicit RequestHandler(NetworkClient *networkClient); + ~RequestHandler() = default; + void addTask(Request *request); + void addTask(Request *request, std::function func); + void notify(RequestReply *requestReply); + void wait(); + RequestReplyContext &get(); + + private: + void handleRequest(Request *request); + + private: + NetworkClient *networkClient_; + BlockingConcurrentQueue pendingRequestQueue_; + std::mutex h_mtx; + unordered_map> callback_map; + uint64_t total_num = 0; + uint64_t begin = 0; + uint64_t end = 0; + uint64_t time = 0; + bool op_finished = false; + std::condition_variable cv; + RequestReplyContext requestReplyContext; +}; + +class ClientShutdownCallback : public Callback { + public: + ClientShutdownCallback() {} + ~ClientShutdownCallback() = default; + void operator()(void *param_1, void *param_2) {} +}; + +class ClientConnectedCallback : public Callback { + public: + explicit ClientConnectedCallback(NetworkClient *networkClient); + ~ClientConnectedCallback() = default; + void operator()(void *param_1, void *param_2); + + private: + NetworkClient *networkClient_; +}; + +class ClientRecvCallback : public Callback { + public: + ClientRecvCallback(ChunkMgr *chunkMgr, RequestHandler *requestHandler); + ~ClientRecvCallback() = default; + void operator()(void *param_1, void *param_2); + + private: + ChunkMgr *chunkMgr_; + RequestHandler *requestHandler_; + uint64_t count_ = 0; + uint64_t time = 0; + uint64_t start = 0; + uint64_t end = 0; + std::mutex mtx; +}; + +class ClientSendCallback : public Callback { + public: + explicit ClientSendCallback(ChunkMgr *chunkMgr) : chunkMgr_(chunkMgr) {} + ~ClientSendCallback() = default; + void operator()(void *param_1, void *param_2) { + auto buffer_id_ = *static_cast(param_1); + auto ck = chunkMgr_->get(buffer_id_); + chunkMgr_->reclaim(ck, static_cast(ck->con)); + } + + private: + ChunkMgr *chunkMgr_; +}; + +class NetworkClient : public RmaBufferRegister { + public: + friend ClientConnectedCallback; + NetworkClient() = delete; + NetworkClient(const string &remote_address, const string 
&remote_port); + NetworkClient(const string &remote_address, const string &remote_port, + int worker_num, int buffer_num_per_con, int buffer_size, + int init_buffer_num); + ~NetworkClient(); + int init(RequestHandler *requesthandler); + void shutdown(); + void wait(); + Chunk *register_rma_buffer(char *rma_buffer, uint64_t size) override; + void unregister_rma_buffer(int buffer_id) override; + uint64_t get_dram_buffer(const char *data, uint64_t size); + void reclaim_dram_buffer(uint64_t src_address, uint64_t size); + uint64_t get_rkey(); + void connected(Connection *con); + void send(char *data, uint64_t size); + void read(Request *request); + + private: + string remote_address_; + string remote_port_; + int worker_num_; + int buffer_num_per_con_; + int buffer_size_; + int init_buffer_num_; + Client *client_; + ChunkMgr *chunkMgr_; + Connection *con_; + ClientShutdownCallback *shutdownCallback; + ClientConnectedCallback *connectedCallback; + ClientRecvCallback *recvCallback; + ClientSendCallback *sendCallback; + mutex con_mtx; + bool connected_; + condition_variable con_v; + shared_ptr circularBuffer_; + atomic buffer_id_{0}; +}; + +#endif // PMPOOL_CLIENT_NETWORKCLIENT_H_ diff --git a/rpmp/pmpool/client/PmPoolClient.cc b/rpmp/pmpool/client/PmPoolClient.cc new file mode 100644 index 00000000..8bc72e1c --- /dev/null +++ b/rpmp/pmpool/client/PmPoolClient.cc @@ -0,0 +1,192 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/client/PmPoolClient.cc + * Path: /mnt/spark-pmof/tool/rpmp/pmpool/client + * Created Date: Friday, December 13th 2019, 3:44:08 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include "pmpool/client/PmPoolClient.h" + +#include "pmpool/Digest.h" +#include "pmpool/Event.h" +#include "pmpool/Protocol.h" +#include "NetworkClient.h" + +PmPoolClient::PmPoolClient(const string &remote_address, + const string &remote_port) { + tx_finished = true; + op_finished = false; + networkClient_ = make_shared(remote_address, remote_port); + requestHandler_ = make_shared(networkClient_.get()); +} + +PmPoolClient::~PmPoolClient() {} + +int PmPoolClient::init() { networkClient_->init(requestHandler_.get()); } + +void PmPoolClient::begin_tx() { + std::unique_lock lk(tx_mtx); + while (!tx_finished) { + tx_con.wait(lk); + } + tx_finished; +} + +uint64_t PmPoolClient::alloc(uint64_t size) { + RequestContext rc = {}; + rc.type = ALLOC; + rc.rid = rid_++; + rc.size = size; + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + return requestHandler_->get().address; +} + +int PmPoolClient::free(uint64_t address) { + RequestContext rc = {}; + rc.type = FREE; + rc.rid = rid_++; + rc.address = address; + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + return requestHandler_->get().success; +} + +void PmPoolClient::shutdown() { networkClient_->shutdown(); } + +void PmPoolClient::wait() { networkClient_->wait(); } + +int PmPoolClient::write(uint64_t address, const char *data, uint64_t size) { + RequestContext rc = {}; + rc.type = WRITE; + rc.rid = rid_++; + rc.size = size; + rc.address = address; + // allocate memory for RMA read from client. 
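  // The payload is staged in the client-side circular buffer that
  // NetworkClient registered for RMA, so the request only carries the staging
  // address (src_address) and its remote key (src_rkey) and the server can
  // read the data remotely. The staging region is reclaimed below once the
  // reply arrives.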
+ rc.src_address = networkClient_->get_dram_buffer(data, rc.size); + rc.src_rkey = networkClient_->get_rkey(); + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + auto res = requestHandler_->get().success; + networkClient_->reclaim_dram_buffer(rc.src_address, rc.size); + return res; +} + +uint64_t PmPoolClient::write(const char *data, uint64_t size) { + RequestContext rc = {}; + rc.type = WRITE; + rc.rid = rid_++; + rc.size = size; + rc.address = 0; + // allocate memory for RMA read from client. + rc.src_address = networkClient_->get_dram_buffer(data, rc.size); + rc.src_rkey = networkClient_->get_rkey(); + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + auto res = requestHandler_->get().address; + networkClient_->reclaim_dram_buffer(rc.src_address, rc.size); + return res; +} + +int PmPoolClient::read(uint64_t address, char *data, uint64_t size) { + RequestContext rc = {}; + rc.type = READ; + rc.rid = rid_++; + rc.size = size; + rc.address = address; + // allocate memory for RMA read from client. + rc.src_address = networkClient_->get_dram_buffer(nullptr, rc.size); + rc.src_rkey = networkClient_->get_rkey(); + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + auto res = requestHandler_->get().success; + if (!res) { + memcpy(data, reinterpret_cast(rc.src_address), size); + } + networkClient_->reclaim_dram_buffer(rc.src_address, rc.size); + return res; +} + +int PmPoolClient::read(uint64_t address, char *data, uint64_t size, + std::function func) { + RequestContext rc = {}; + rc.type = READ; + rc.rid = rid_++; + rc.size = size; + rc.address = address; + // allocate memory for RMA read from client. + rc.src_address = networkClient_->get_dram_buffer(nullptr, rc.size); + rc.src_rkey = networkClient_->get_rkey(); + Request request(rc); + requestHandler_->addTask(&request, [&] { + auto res = requestHandler_->get().success; + if (res) { + memcpy(data, reinterpret_cast(rc.src_address), size); + } + networkClient_->reclaim_dram_buffer(rc.src_address, rc.size); + func(res); + }); + return 0; +} + +void PmPoolClient::end_tx() { + std::lock_guard lk(tx_mtx); + tx_finished = true; + tx_con.notify_one(); +} + +int PmPoolClient::put(const string &key, const char *value, uint64_t size) { + uint64_t key_uint; + Digest::computeKeyHash(key, &key_uint); + RequestContext rc = {}; + rc.type = PUT; + rc.rid = rid_++; + rc.size = size; + rc.address = 0; + // allocate memory for RMA read from client. 
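  // Same staging path as write(): the value is copied into the registered
  // circular buffer, and rc.key carries the 64-bit hash of the user key
  // computed by Digest::computeKeyHash above.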
+ rc.src_address = networkClient_->get_dram_buffer(value, rc.size); + rc.src_rkey = networkClient_->get_rkey(); + rc.key = key_uint; + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + auto res = requestHandler_->get().address; + networkClient_->reclaim_dram_buffer(rc.src_address, rc.size); + return res; +} + +vector PmPoolClient::get(const string &key) { + uint64_t key_uint; + Digest::computeKeyHash(key, &key_uint); + RequestContext rc = {}; + rc.type = GET_META; + rc.rid = rid_++; + rc.address = 0; + rc.key = key_uint; + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + auto bml = requestHandler_->get().bml; + return bml; +} + +int PmPoolClient::del(const string &key) { + uint64_t key_uint; + Digest::computeKeyHash(key, &key_uint); + RequestContext rc = {}; + rc.type = DELETE; + rc.rid = rid_++; + rc.key = key_uint; + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + auto res = requestHandler_->get().success; + return res; +} diff --git a/rpmp/pmpool/client/PmPoolClient.h b/rpmp/pmpool/client/PmPoolClient.h new file mode 100644 index 00000000..3cd2964d --- /dev/null +++ b/rpmp/pmpool/client/PmPoolClient.h @@ -0,0 +1,100 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/client/PmPoolClient.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool/client + * Created Date: Friday, December 13th 2019, 3:43:04 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_CLIENT_PMPOOLCLIENT_H_ +#define PMPOOL_CLIENT_PMPOOLCLIENT_H_ + +#define INITIAL_BUFFER_NUMBER 64 + +#include +#include +#include +#include + +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include +#include +#include + +#include "../Common.h" +#include "../Base.h" +#include "../ThreadWrapper.h" + +class NetworkClient; +class RequestHandler; +class Function; + +using std::atomic; +using std::make_shared; +using std::shared_ptr; +using std::string; +using std::vector; + +class PmPoolClient { + public: + PmPoolClient() = delete; + PmPoolClient(const string &remote_address, const string &remote_port); + ~PmPoolClient(); + int init(); + + /// memory pool interface + void begin_tx(); + /// Allocate the given size of memory from remote memory pool. + /// Return the global address of memory pool. + uint64_t alloc(uint64_t size); + + /// Free memory with the global address. + /// Address is the global address that returned by alloc. + /// Return 0 if succeed, return others value if fail. + int free(uint64_t address); + + /// Write data to the address of remote memory pool. + /// The size is number of bytes + /// Return 0 if succeed, return others value if fail. + int write(uint64_t address, const char *data, uint64_t size); + + /// Return global address if succeed, return -1 if fail. + uint64_t write(const char *data, uint64_t size); + + /// Read from the global address of remote memory pool and copy to data + /// pointer. + /// Return 0 if succeed, return others value if fail. 
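  ///
  /// Illustrative round trip (a sketch only; the endpoint, buffer size, and
  /// transaction bracketing here are example choices, and error handling is
  /// omitted):
  ///
  ///   PmPoolClient client("172.168.0.40", "12346");
  ///   client.init();
  ///   client.begin_tx();
  ///   uint64_t addr = client.alloc(4096);
  ///   char buf[4096] = {0};
  ///   client.write(addr, buf, sizeof(buf));
  ///   client.read(addr, buf, sizeof(buf));
  ///   client.free(addr);
  ///   client.end_tx();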
+ int read(uint64_t address, char *data, uint64_t size); + + int read(uint64_t address, char *data, uint64_t size, + std::function func); + void end_tx(); + + /// key-value storage interface + int put(const string &key, const char *value, uint64_t size); + vector get(const string &key); + int del(const string &key); + + void shutdown(); + void wait(); + + private: + shared_ptr requestHandler_; + shared_ptr networkClient_; + atomic rid_ = {0}; + std::mutex tx_mtx; + std::condition_variable tx_con; + bool tx_finished; + std::mutex op_mtx; + bool op_finished; +}; + +#endif // PMPOOL_CLIENT_PMPOOLCLIENT_H_ diff --git a/rpmp/pmpool/client/java/rpmp/.classpath b/rpmp/pmpool/client/java/rpmp/.classpath new file mode 100644 index 00000000..71f5fefe --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/.classpath @@ -0,0 +1,44 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/rpmp/pmpool/client/java/rpmp/.project b/rpmp/pmpool/client/java/rpmp/.project new file mode 100644 index 00000000..066a77e0 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/.project @@ -0,0 +1,23 @@ + + + rpmp + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.core.resources.prefs b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 00000000..f9fe3459 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,4 @@ +eclipse.preferences.version=1 +encoding//src/main/java=UTF-8 +encoding//src/test/java=UTF-8 +encoding/=UTF-8 diff --git a/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.jdt.apt.core.prefs b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.jdt.apt.core.prefs new file mode 100644 index 00000000..d4313d4b --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.jdt.apt.core.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.apt.aptEnabled=false diff --git a/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.jdt.core.prefs b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 00000000..b11489fa --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,9 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 +org.eclipse.jdt.core.compiler.compliance=1.7 +org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore +org.eclipse.jdt.core.compiler.processAnnotations=disabled +org.eclipse.jdt.core.compiler.release=disabled +org.eclipse.jdt.core.compiler.source=1.7 diff --git a/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.m2e.core.prefs b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.m2e.core.prefs new file mode 100644 index 00000000..f897a7f1 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.m2e.core.prefs @@ -0,0 +1,4 @@ +activeProfiles= +eclipse.preferences.version=1 +resolveWorkspaceProjects=true +version=1 diff --git a/rpmp/pmpool/client/java/rpmp/pom.xml b/rpmp/pmpool/client/java/rpmp/pom.xml new file mode 100644 index 00000000..65310b2a --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/pom.xml @@ -0,0 +1,75 @@ + + + + 4.0.0 + + com.intel.rpmp + rpmp + 0.1 + + rpmp + + 
http://www.example.com + + + UTF-8 + 1.7 + 1.7 + + + + + junit + junit + 4.11 + test + + + + + + + + + maven-clean-plugin + 3.1.0 + + + + maven-resources-plugin + 3.0.2 + + + maven-compiler-plugin + 3.8.0 + + + maven-surefire-plugin + 2.22.1 + + + maven-jar-plugin + 3.0.2 + + + maven-install-plugin + 2.5.2 + + + maven-deploy-plugin + 2.8.2 + + + + maven-site-plugin + 3.7.1 + + + maven-project-info-reports-plugin + 3.0.0 + + + + + diff --git a/rpmp/pmpool/client/java/rpmp/src/main/java/com/intel/rpmp/PmPoolClient.java b/rpmp/pmpool/client/java/rpmp/src/main/java/com/intel/rpmp/PmPoolClient.java new file mode 100644 index 00000000..1d431620 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/src/main/java/com/intel/rpmp/PmPoolClient.java @@ -0,0 +1,115 @@ +package com.intel.rpmp; + +import java.io.IOException; +import java.lang.reflect.Constructor; +import java.nio.ByteBuffer; + +/** + * PmPoolClient + * + */ +public class PmPoolClient { + static { + System.loadLibrary("pmpool"); + } + + public PmPoolClient(String remote_address, String remote_port) { + objectId = newPmPoolClient_(remote_address, remote_port); + } + + public long alloc(long size) { + return alloc_(size, objectId); + } + + public int free(long address) { + return free_(address, objectId); + } + + public int write(long address, String data, long size) { + return write_(address, data, size, objectId); + } + + public long write(String data, long size) { + return alloc_and_write_(data, size, objectId); + } + + public long write(ByteBuffer data, long size) { + return alloc_and_write_(data, size, objectId); + } + + public int read(long address, long size, ByteBuffer byteBuffer) { + return read_(address, size, byteBuffer, objectId); + } + + public int put(String key, ByteBuffer data, long size) { + return put(key, data, size, objectId); + } + + public int del(String key) { + return del(key, objectId); + } + + public void shutdown() { + shutdown_(objectId); + } + + public void waitToStop() { + waitToStop_(objectId); + } + + public void dispose() { + dispose_(objectId); + } + + private ByteBuffer convertToByteBuffer(long address, int length) throws IOException { + Class classDirectByteBuffer; + try { + classDirectByteBuffer = Class.forName("java.nio.DirectByteBuffer"); + } catch (ClassNotFoundException e) { + throw new IOException("java.nio.DirectByteBuffer class not found"); + } + Constructor constructor; + try { + constructor = classDirectByteBuffer.getDeclaredConstructor(long.class, int.class); + } catch (NoSuchMethodException e) { + throw new IOException("java.nio.DirectByteBuffer constructor not found"); + } + constructor.setAccessible(true); + ByteBuffer byteBuffer; + try { + byteBuffer = (ByteBuffer) constructor.newInstance(address, length); + } catch (Exception e) { + throw new IOException("java.nio.DirectByteBuffer exception: " + e.toString()); + } + + return byteBuffer; + } + + private native long newPmPoolClient_(String remote_address, String remote_port); + + private native long alloc_(long size, long objectId); + + private native int free_(long address, long objectId); + + private native int write_(long address, String data, long size, long objectId); + + private native long alloc_and_write_(String data, long size, long objectId); + + private native long alloc_and_write_(ByteBuffer data, long size, long objectId); + + private native int put(String key, ByteBuffer data, long size, long objectId); + + private native long[] getMeta(String key, long objectId); + + private native int del(String key, long objectId); + + private 
native int read_(long address, long size, ByteBuffer byteBuffer, long objectId); + + private native void shutdown_(long objectId); + + private native void waitToStop_(long objectId); + + private native void dispose_(long objectId); + + private long objectId; +} diff --git a/rpmp/pmpool/client/java/rpmp/src/test/java/com/intel/rpmp/PmPoolClientTest.java b/rpmp/pmpool/client/java/rpmp/src/test/java/com/intel/rpmp/PmPoolClientTest.java new file mode 100644 index 00000000..2f4669ad --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/src/test/java/com/intel/rpmp/PmPoolClientTest.java @@ -0,0 +1,94 @@ +package com.intel.rpmp; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import java.nio.ByteBuffer; +import java.util.Random; + +/** + * You need to start rpmp service before running the following tests. + */ +public class PmPoolClientTest +{ + @Before + public void setup() { + pmPoolClient = new PmPoolClient("172.168.0.40", "12346"); + } + + @After + public void tear() { + pmPoolClient.shutdown(); + pmPoolClient.waitToStop(); + pmPoolClient.dispose(); + } + + @Test + public void remoteAlloc() { + for (int i = 0; i < 100; i++) { + long address = pmPoolClient.alloc(4096); + assertTrue(address > 0); + } + } + + @Test + public void remoteWrite() { + Random rand = new Random(); + for (int i = 0; i < 100; i++) { + long address = pmPoolClient.alloc(rand.nextInt((1024*1024*8))); + assertTrue(address > 0); + String data = "hello"; + assertEquals(0, pmPoolClient.write(address, data, data.length())); + } + } + + + @Test + public void remoteRead() + { + long address = pmPoolClient.alloc(4096); + assertTrue(address > 0); + String data = "hello"; + assertEquals(0, pmPoolClient.write(address, data, data.length())); + ByteBuffer byteBuffer = ByteBuffer.allocateDirect(4096); + ByteBuffer testBuffer = ByteBuffer.allocateDirect(4096); + for (int i = 0; i < 5; i++) { + testBuffer.put(data.getBytes()[i]); + } + testBuffer.flip(); + assertEquals(0, pmPoolClient.read(address, 5, byteBuffer)); + for (int i = 0; i < 5; i++) { + assertEquals(true, (char)byteBuffer.get() == (char)testBuffer.get()); + } + } + + @Test + public void remoteAllocAndWrite() { + for (int i = 0; i < 100; i++) { + String data = "hello"; + assertTrue(pmPoolClient.write(data, data.length()) > 0); + } + } + + public void remoteAllocAndWriteThenRead() + { + String data = "hello"; + long address = pmPoolClient.write(data, data.length()); + assertTrue(address > 0); + ByteBuffer byteBuffer = ByteBuffer.allocateDirect(4096); + ByteBuffer testBuffer = ByteBuffer.allocateDirect(4096); + for (int i = 0; i < 5; i++) { + testBuffer.put(data.getBytes()[i]); + } + testBuffer.flip(); + assertEquals(0, pmPoolClient.read(address, 5, byteBuffer)); + for (int i = 0; i < 5; i++) { + assertEquals(true, (char)byteBuffer.get() == (char)testBuffer.get()); + } + } + + private PmPoolClient pmPoolClient; +} diff --git a/rpmp/pmpool/client/java/rpmp/target/classes/com/intel/rpmp/PmPoolClient.class b/rpmp/pmpool/client/java/rpmp/target/classes/com/intel/rpmp/PmPoolClient.class new file mode 100644 index 00000000..2cdedac0 Binary files /dev/null and b/rpmp/pmpool/client/java/rpmp/target/classes/com/intel/rpmp/PmPoolClient.class differ diff --git a/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst 
b/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst new file mode 100644 index 00000000..e69de29b diff --git a/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst b/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst new file mode 100644 index 00000000..ab698a7d --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst @@ -0,0 +1 @@ +/mnt/spark-pmof/tool/rpmp/pmpool/client/java/rpmp/src/main/java/com/intel/rpmp/PmPoolClient.java diff --git a/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst b/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst new file mode 100644 index 00000000..e69de29b diff --git a/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst b/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst new file mode 100644 index 00000000..dfd02fe7 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst @@ -0,0 +1 @@ +/mnt/spark-pmof/tool/rpmp/pmpool/client/java/rpmp/src/test/java/com/intel/rpmp/PmPoolClientTest.java diff --git a/rpmp/pmpool/client/java/rpmp/target/surefire-reports/TEST-com.intel.rpmp.PmPoolClientTest.xml b/rpmp/pmpool/client/java/rpmp/target/surefire-reports/TEST-com.intel.rpmp.PmPoolClientTest.xml new file mode 100644 index 00000000..33c46b59 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/target/surefire-reports/TEST-com.intel.rpmp.PmPoolClientTest.xml @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/rpmp/pmpool/client/java/rpmp/target/surefire-reports/com.intel.rpmp.PmPoolClientTest.txt b/rpmp/pmpool/client/java/rpmp/target/surefire-reports/com.intel.rpmp.PmPoolClientTest.txt new file mode 100644 index 00000000..53952175 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/target/surefire-reports/com.intel.rpmp.PmPoolClientTest.txt @@ -0,0 +1,4 @@ +------------------------------------------------------------------------------- +Test set: com.intel.rpmp.PmPoolClientTest +------------------------------------------------------------------------------- +Tests run: 4, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 2.63 s - in com.intel.rpmp.PmPoolClientTest diff --git a/rpmp/pmpool/client/java/rpmp/target/test-classes/com/intel/rpmp/PmPoolClientTest.class b/rpmp/pmpool/client/java/rpmp/target/test-classes/com/intel/rpmp/PmPoolClientTest.class new file mode 100644 index 00000000..d196a320 Binary files /dev/null and b/rpmp/pmpool/client/java/rpmp/target/test-classes/com/intel/rpmp/PmPoolClientTest.class differ diff --git a/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.cc b/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.cc new file mode 100644 index 00000000..75cbca8a --- /dev/null +++ b/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.cc @@ -0,0 +1,168 @@ +/* + * Filename: + * /mnt/spark-pmof/tool/rpmp/pmpool/client/native/PmPoolClientNative.cc Path: + * /mnt/spark-pmof/tool/rpmp/pmpool/client/native Created Date: Monday, 
February + * 24th 2020, 9:23:22 pm Author: root + * + * Copyright (c) 2020 Intel + */ +#include + +#include "pmpool/client/PmPoolClient.h" +#include "pmpool/client/native/com_intel_rpmp_PmPoolClient.h" + +JNIEXPORT jlong JNICALL Java_com_intel_rpmp_PmPoolClient_newPmPoolClient_1( + JNIEnv *env, jobject obj, jstring address, jstring port) { + const char *remote_address = env->GetStringUTFChars(address, 0); + const char *remote_port = env->GetStringUTFChars(port, 0); + + PmPoolClient *client = new PmPoolClient(remote_address, remote_port); + client->begin_tx(); + client->init(); + client->end_tx(); + + env->ReleaseStringUTFChars(address, remote_address); + env->ReleaseStringUTFChars(port, remote_port); + + return reinterpret_cast(client); +} + +JNIEXPORT jlong JNICALL Java_com_intel_rpmp_PmPoolClient_alloc_1( + JNIEnv *env, jobject obj, jlong size, jlong objectId) { + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + uint64_t address = client->alloc(size); + client->end_tx(); + return address; +} + +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_free_1(JNIEnv *env, + jobject obj, + jlong address, + jlong objectId) { + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + int success = client->free(address); + client->end_tx(); + return success; +} + +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_write_1( + JNIEnv *env, jobject obj, jlong address, jstring data, jlong size, + jlong objectId) { + const char *raw_data = env->GetStringUTFChars(data, 0); + + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + int success = client->write(address, raw_data, size); + client->end_tx(); + + env->ReleaseStringUTFChars(data, raw_data); + + return success; +} +JNIEXPORT jlong JNICALL +Java_com_intel_rpmp_PmPoolClient_alloc_1and_1write_1__Ljava_lang_String_2JJ( + JNIEnv *env, jobject obj, jstring data, jlong size, jlong objectId) { + const char *raw_data = env->GetStringUTFChars(data, 0); + + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + uint64_t address = client->write(raw_data, size); + client->end_tx(); + + env->ReleaseStringUTFChars(data, raw_data); + return address; +} + +JNIEXPORT jlong JNICALL +Java_com_intel_rpmp_PmPoolClient_alloc_1and_1write_1__Ljava_nio_ByteBuffer_2JJ( + JNIEnv *env, jobject obj, jobject data, jlong size, jlong objectId) { + char *raw_data = static_cast((*env).GetDirectBufferAddress(data)); + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + uint64_t address = client->write(raw_data, size); + client->end_tx(); +} + +JNIEXPORT jint JNICALL +Java_com_intel_rpmp_PmPoolClient_put(JNIEnv *env, jobject obj, jstring key, + jobject data, jlong size, jlong objectId) { + char *raw_data = static_cast((*env).GetDirectBufferAddress(data)); + const char *raw_key = env->GetStringUTFChars(key, 0); + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + client->put(raw_key, raw_data, size); + client->end_tx(); + env->ReleaseStringUTFChars(key, raw_key); + return 0; +} + +JNIEXPORT jlongArray JNICALL Java_com_intel_rpmp_PmPoolClient_getMeta( + JNIEnv *env, jobject obj, jstring key, jlong objectId) { + const char *raw_key = env->GetStringUTFChars(key, 0); + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + auto bml = client->get(raw_key); + client->end_tx(); + env->ReleaseStringUTFChars(key, raw_key); + int longCArraySize = bml.size() * 2; + jlongArray longJavaArray = env->NewLongArray(longCArraySize); + uint64_t 
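  // The block-meta list is flattened into the jlongArray as consecutive
  // (address, size) pairs, matching the fill loop below.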
*longCArray = + static_cast(std::malloc(longCArraySize * sizeof(uint64_t))); + if (longJavaArray == nullptr) { + return nullptr; + } + int i = 0; + for (auto bm : bml) { + longCArray[i++] = bm.address; + longCArray[i++] = bm.size; + } + env->SetLongArrayRegion(longJavaArray, 0, longCArraySize, + reinterpret_cast(longCArray)); + std::free(longCArray); + env->ReleaseStringUTFChars(key, raw_key); + return longJavaArray; +} + +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_del(JNIEnv *env, + jobject obj, + jstring key, + jlong objectId) { + const char *raw_key = env->GetStringUTFChars(key, 0); + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + int res = client->del(raw_key); + client->end_tx(); + env->ReleaseStringUTFChars(key, raw_key); + return res; +} + +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_read_1( + JNIEnv *env, jobject obj, jlong address, jlong size, jobject data, + jlong objectId) { + char *raw_data = static_cast((*env).GetDirectBufferAddress(data)); + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + int success = client->read(address, raw_data, size); + client->end_tx(); + return success; +} + +JNIEXPORT void JNICALL Java_com_intel_rpmp_PmPoolClient_shutdown_1( + JNIEnv *env, jobject obj, jlong objectId) { + PmPoolClient *client = reinterpret_cast(objectId); + client->shutdown(); +} + +JNIEXPORT void JNICALL Java_com_intel_rpmp_PmPoolClient_waitToStop_1( + JNIEnv *env, jobject obj, jlong objectId) { + PmPoolClient *client = reinterpret_cast(objectId); + client->wait(); +} + +JNIEXPORT void JNICALL Java_com_intel_rpmp_PmPoolClient_dispose_1( + JNIEnv *env, jobject obj, jlong objectId) { + PmPoolClient *client = reinterpret_cast(objectId); + delete client; +} diff --git a/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.h b/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.h new file mode 100644 index 00000000..23f80521 --- /dev/null +++ b/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.h @@ -0,0 +1,140 @@ +/* + * Filename: + * /mnt/spark-pmof/Spark-PMoF/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.h + * Path: /mnt/spark-pmof/Spark-PMoF/rpmp/pmpool/client/native + * Created Date: Thursday, March 5th 2020, 10:44:12 am + * Author: root + * + * Copyright (c) 2020 Intel + */ + +#include +/* Header for class com_intel_rpmp_PmPoolClient */ + +#ifndef PMPOOL_CLIENT_NATIVE_COM_INTEL_RPMP_PMPOOLCLIENT_H_ +#define PMPOOL_CLIENT_NATIVE_COM_INTEL_RPMP_PMPOOLCLIENT_H_ +#ifdef __cplusplus +extern "C" { +#endif +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: newPmPoolClient_ + * Signature: (Ljava/lang/String;Ljava/lang/String;)J + */ +JNIEXPORT jlong JNICALL Java_com_intel_rpmp_PmPoolClient_newPmPoolClient_1( + JNIEnv *, jobject, jstring, jstring); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: alloc_ + * Signature: (JJ)J + */ +JNIEXPORT jlong JNICALL Java_com_intel_rpmp_PmPoolClient_alloc_1(JNIEnv *, + jobject, jlong, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: free_ + * Signature: (JJ)I + */ +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_free_1(JNIEnv *, + jobject, jlong, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: write_ + * Signature: (JLjava/lang/String;JJ)I + */ +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_write_1(JNIEnv *, + jobject, jlong, + jstring, jlong, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: alloc_and_write_ + * Signature: (Ljava/lang/String;JJ)J + */ +JNIEXPORT jlong JNICALL 
+Java_com_intel_rpmp_PmPoolClient_alloc_1and_1write_1__Ljava_lang_String_2JJ( + JNIEnv *, jobject, jstring, jlong, jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: alloc_and_write_ + * Signature: (Ljava/nio/ByteBuffer;JJ)J + */ +JNIEXPORT jlong JNICALL +Java_com_intel_rpmp_PmPoolClient_alloc_1and_1write_1__Ljava_nio_ByteBuffer_2JJ( + JNIEnv *, jobject, jobject, jlong, jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: put + * Signature: (Ljava/lang/String;Ljava/nio/ByteBuffer;JJ)I + */ +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_put(JNIEnv *, jobject, + jstring, jobject, + jlong, jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: getMeta + * Signature: (Ljava/lang/String;J)[J + */ +JNIEXPORT jlongArray JNICALL Java_com_intel_rpmp_PmPoolClient_getMeta(JNIEnv *, + jobject, + jstring, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: del + * Signature: (Ljava/lang/String;J)I + */ +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_del(JNIEnv *, jobject, + jstring, jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: read_ + * Signature: (JJLjava/nio/ByteBuffer;J)I + */ +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_read_1(JNIEnv *, + jobject, jlong, + jlong, jobject, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: shutdown_ + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_com_intel_rpmp_PmPoolClient_shutdown_1(JNIEnv *, + jobject, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: waitToStop_ + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_com_intel_rpmp_PmPoolClient_waitToStop_1(JNIEnv *, + jobject, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: dispose_ + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_com_intel_rpmp_PmPoolClient_dispose_1(JNIEnv *, + jobject, + jlong); + +#ifdef __cplusplus +} +#endif +#endif // PMPOOL_CLIENT_NATIVE_COM_INTEL_RPMP_PMPOOLCLIENT_H_ diff --git a/rpmp/pmpool/hash/xxhash.cc b/rpmp/pmpool/hash/xxhash.cc new file mode 100644 index 00000000..c99bc012 --- /dev/null +++ b/rpmp/pmpool/hash/xxhash.cc @@ -0,0 +1,1038 @@ +#pragma clang system_header +#pragma gcc system_header +/* +* xxHash - Fast Hash algorithm +* Copyright (C) 2012-2016, Yann Collet +* +* BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: +* +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* * Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following disclaimer +* in the documentation and/or other materials provided with the +* distribution. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +* You can contact the author at : +* - xxHash homepage: http://www.xxhash.com +* - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + + +/* ************************************* +* Tuning parameters +***************************************/ +/*!XXH_FORCE_MEMORY_ACCESS : +* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. +* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. +* The below switch allow to select different access method for improved performance. +* Method 0 (default) : use `memcpy()`. Safe and portable. +* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). +* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. +* Method 2 : direct access. This method doesn't depend on compiler but violate C standard. +* It can generate buggy code on targets which do not support unaligned memory accesses. +* But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) +* See http://stackoverflow.com/a/32095106/646947 for details. +* Prefer these methods in priority order (0 > 1 > 2) +*/ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \ + || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7S__) )) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/*!XXH_ACCEPT_NULL_INPUT_POINTER : +* If input pointer is NULL, xxHash default behavior is to dereference it, triggering a segfault. +* When this macro is enabled, xxHash actively checks input for null pointer. +* It it is, result for null input pointers is the same as a null-length input. +*/ +#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ +# define XXH_ACCEPT_NULL_INPUT_POINTER 0 +#endif + +/*!XXH_FORCE_NATIVE_FORMAT : +* By default, xxHash library provides endian-independent Hash values, based on little-endian convention. +* Results are therefore identical for little-endian and big-endian CPU. +* This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. +* Should endian-independence be of no importance for your application, you may set the #define below to 1, +* to improve speed for Big-endian CPU. +* This option has no impact on Little_Endian CPU. +*/ +#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */ +# define XXH_FORCE_NATIVE_FORMAT 0 +#endif + +/*!XXH_FORCE_ALIGN_CHECK : +* This is a minor performance trick, only useful with lots of very small keys. 
+* It means : check for aligned/unaligned input. +* The check costs one initial branch per hash; +* set it to 0 when the input is guaranteed to be aligned, +* or when alignment doesn't matter for performance. +*/ +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +/*! Modify the local functions below should you wish to use some other memory routines +* for malloc(), free() */ +#include +static void* XXH_malloc(size_t s) { return malloc(s); } +static void XXH_free(void* p) { free(p); } +/*! and for memcpy() */ +#include +static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest, src, size); } + +#include /* assert */ + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash/xxhash.h" + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# define FORCE_INLINE static __forceinline +#else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +#endif + + +/* ************************************* +* Basic Types +***************************************/ +#ifndef MEM_MODULE +# if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +# else +typedef unsigned char BYTE; +typedef unsigned short U16; +typedef unsigned int U32; +# endif +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static U32 XXH_read32(const void* memPtr) { return *(const U32*)memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U32 u32; } __attribute__((packed)) unalign; +static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } + +#else + +/* portable and safe solution. Generally efficient. 
+* see : http://stackoverflow.com/a/32095106/646947 +*/ +static U32 XXH_read32(const void* memPtr) +{ + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */ +#if defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static U32 XXH_swap32(U32 x) +{ + return ((x << 24) & 0xff000000) | + ((x << 8) & 0x00ff0000) | + ((x >> 8) & 0x0000ff00) | + ((x >> 24) & 0x000000ff); +} +#endif + + +/* ************************************* +* Architecture Macros +***************************************/ +typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess; + +/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ +#ifndef XXH_CPU_LITTLE_ENDIAN +static int XXH_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align == XXH_unaligned) + return endian == XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); + else + return endian == XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); +} + +FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE32_align(ptr, endian, XXH_unaligned); +} + +static U32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} + + +/* ************************************* +* Macros +***************************************/ +#define XXH_STATIC_ASSERT(c) { enum { XXH_sa = 1/(int)(!!(c)) }; } /* use after variable declarations */ +XXH_PUBLIC_API unsigned XXH_versionNumber(void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +static const U32 PRIME32_1 = 2654435761U; /* 0b10011110001101110111100110110001 */ +static const U32 PRIME32_2 = 2246822519U; /* 0b10000101111010111100101001110111 */ +static const U32 PRIME32_3 = 3266489917U; /* 0b11000010101100101010111000111101 */ +static const U32 PRIME32_4 = 668265263U; /* 0b00100111110101001110101100101111 */ +static const U32 PRIME32_5 = 374761393U; /* 0b00010110010101100110011110110001 */ + +static U32 XXH32_round(U32 seed, U32 input) +{ + seed += input * PRIME32_2; + seed = XXH_rotl32(seed, 13); + seed *= PRIME32_1; + return seed; +} + +/* mix all bits */ +static U32 XXH32_avalanche(U32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) + +static U32 +XXH32_finalize(U32 h32, const void* ptr, size_t len, + XXH_endianess endian, XXH_alignment align) + +{ + const BYTE* p = (const BYTE*)ptr; + +#define PROCESS1 \ + h32 += (*p++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1 ; + +#define PROCESS4 \ + h32 += XXH_get32bits(p) * PRIME32_3; \ + p+=4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; + + switch (len & 15) /* or switch(bEnd - p) */ + { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + assert(0); + return h32; /* reaching this point is deemed impossible */ +} + + +FORCE_INLINE U32 +XXH32_endian_align(const void* input, size_t len, U32 seed, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (p == NULL) { + len = 0; + bEnd = p = (const BYTE*)(size_t)16; + } +#endif + + if (len >= 16) { + const BYTE* const limit = bEnd - 15; + U32 v1 = seed + PRIME32_1 + PRIME32_2; + U32 v2 = seed + PRIME32_2; + U32 v3 = seed + 0; + U32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(p)); p += 4; + v2 = XXH32_round(v2, XXH_get32bits(p)); p += 4; + v3 = XXH32_round(v3, XXH_get32bits(p)); p += 4; + v4 = XXH32_round(v4, XXH_get32bits(p)); p += 4; + } while (p < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } + else { + h32 = seed + PRIME32_5; + } + + h32 += (U32)len; + + return XXH32_finalize(h32, p, len & 15, endian, align); +} + + 
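+// Usage sketch (illustrative only, not part of the upstream xxHash sources):
+// it shows how the one-shot XXH32() entry point defined just below relates to
+// the streaming state API and the canonical (big-endian) representation. The
+// sample string and seed are arbitrary; the include path mirrors the one used
+// at the top of this file.
+//
+//   #include <string.h>
+//   #define XXH_STATIC_LINKING_ONLY
+//   #include "xxhash/xxhash.h"
+//
+//   int xxh32_usage_example(void)      // hypothetical helper, for illustration
+//   {
+//       const char data[] = "spark-pmof shuffle block";
+//       unsigned int seed = 0;
+//
+//       // One-shot hashing.
+//       unsigned int h1 = XXH32(data, strlen(data), seed);
+//
+//       // Streaming: feeding the same bytes in pieces yields the same hash.
+//       XXH32_state_t* st = XXH32_createState();
+//       XXH32_reset(st, seed);
+//       XXH32_update(st, data, 5);
+//       XXH32_update(st, data + 5, strlen(data) - 5);
+//       unsigned int h2 = XXH32_digest(st);
+//       XXH32_freeState(st);
+//
+//       // Canonical form: endian-independent byte layout, safe to store on disk.
+//       XXH32_canonical_t c;
+//       XXH32_canonicalFromHash(&c, h1);
+//       unsigned int h3 = XXH32_hashFromCanonical(&c);
+//
+//       return (h1 == h2) && (h1 == h3);   // both comparisons hold
+//   }
+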
+XXH_PUBLIC_API unsigned int XXH32(const void* input, size_t len, unsigned int seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, input, len); + return XXH32_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } + } + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + + + +/*====== Hash streaming ======*/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +FORCE_INLINE XXH_errorcode +XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + if (input == NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + + state->total_len_32 += (unsigned)len; + state->large_len |= (len >= 16) | (state->total_len_32 >= 16); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); + state->memsize += (unsigned)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16 - state->memsize); + { const U32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); + } + p += 16 - state->memsize; + state->memsize = 0; + } + + if (p <= bEnd - 16) { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p += 4; + v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p += 4; + v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p += 4; + v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p += 4; + } while (p <= limit); + + 
state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd - p)); + state->memsize = (unsigned)(bEnd - p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH32_update_endian(state_in, input, len, XXH_bigEndian); +} + + +FORCE_INLINE U32 +XXH32_digest_endian(const XXH32_state_t* state, XXH_endianess endian) +{ + U32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } + else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, state->mem32, state->memsize, endian, XXH_aligned); +} + + +XXH_PUBLIC_API unsigned int XXH32_digest(const XXH32_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_digest_endian(state_in, XXH_littleEndian); + else + return XXH32_digest_endian(state_in, XXH_bigEndian); +} + + +/*====== Canonical representation ======*/ + +/*! Default XXH result types are basic unsigned 32 and 64 bits. +* The canonical representation follows human-readable write convention, aka big-endian (large digits first). +* These functions allow transformation of hash result into and from its canonical format. +* This way, hash values can be written into a file or buffer, remaining comparable across different systems. +*/ + +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/*====== Memory access ======*/ + +#ifndef MEM_MODULE +# define MEM_MODULE +# if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include +typedef uint64_t U64; +# else +/* if compiler doesn't support unsigned long long, replace by another 64-bit type */ +typedef unsigned long long U64; +# endif +#endif + + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static U64 XXH_read64(const void* memPtr) { return *(const U64*)memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign64; +static U64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* portable and safe solution. Generally efficient. 
+* see : http://stackoverflow.com/a/32095106/646947 +*/ + +static U64 XXH_read64(const void* memPtr) +{ + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static U64 XXH_swap64(U64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + +FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align == XXH_unaligned) + return endian == XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); + else + return endian == XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr); +} + +FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE64_align(ptr, endian, XXH_unaligned); +} + +static U64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} + + +/*====== xxh64 ======*/ + +static const U64 PRIME64_1 = 11400714785074694791ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const U64 PRIME64_2 = 14029467366897019727ULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const U64 PRIME64_3 = 1609587929392839161ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const U64 PRIME64_4 = 9650029242287828579ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const U64 PRIME64_5 = 2870177450012600261ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +static U64 XXH64_round(U64 acc, U64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static U64 XXH64_mergeRound(U64 acc, U64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static U64 XXH64_avalanche(U64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) + +static U64 +XXH64_finalize(U64 h64, const void* ptr, size_t len, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)ptr; + +#define PROCESS1_64 \ + h64 ^= (*p++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + +#define PROCESS4_64 \ + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; \ + p+=4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + +#define PROCESS8_64 { \ + U64 const k1 = XXH64_round(0, XXH_get64bits(p)); \ + p+=8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} + + switch (len & 31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return 
XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + + /* impossible to reach */ + assert(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +FORCE_INLINE U64 +XXH64_endian_align(const void* input, size_t len, U64 seed, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (p == NULL) { + len = 0; + bEnd = p = (const BYTE*)(size_t)32; + } +#endif + + if (len >= 32) { + const BYTE* const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(p)); p += 8; + v2 = XXH64_round(v2, XXH_get64bits(p)); p += 8; + v3 = XXH64_round(v3, XXH_get64bits(p)); p += 8; + v4 = XXH64_round(v4, XXH_get64bits(p)); p += 8; + } while (p <= limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } + else { + h64 = seed + PRIME64_5; + } + + h64 += (U64)len; + + return XXH64_finalize(h64, p, len, endian, align); +} + + +XXH_PUBLIC_API unsigned long long XXH64(const void* input, size_t len, unsigned long long seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, input, len); + return XXH64_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7) == 0) { /* Input is aligned, let's leverage the speed advantage */ + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } + } + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +/*====== Hash Streaming ======*/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return 
(XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed) +{ + XXH64_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + +FORCE_INLINE XXH_errorcode +XXH64_update_endian(XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + if (input == NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32 - state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64 + 0, endian)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64 + 1, endian)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64 + 2, endian)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64 + 3, endian)); + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p + 32 <= bEnd) { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p += 8; + v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p += 8; + v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p += 8; + v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p += 8; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd - p)); + state->memsize = (unsigned)(bEnd - p); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH64_update_endian(state_in, input, len, XXH_bigEndian); +} + +FORCE_INLINE U64 XXH64_digest_endian(const XXH64_state_t* state, XXH_endianess endian) +{ + U64 h64; + + if (state->total_len >= 32) { + U64 const v1 = state->v1; + U64 const v2 = state->v2; + U64 const v3 = state->v3; + U64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } + else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (U64)state->total_len; + + return XXH64_finalize(h64, 
state->mem64, (size_t)state->total_len, endian, XXH_aligned);
+}
+
+XXH_PUBLIC_API unsigned long long XXH64_digest(const XXH64_state_t* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH64_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+/*====== Canonical representation ======*/
+
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);
+}
+
+#endif
diff --git a/rpmp/pmpool/queue/blockingconcurrentqueue.h b/rpmp/pmpool/queue/blockingconcurrentqueue.h
new file mode 100644
index 00000000..c855f9df
--- /dev/null
+++ b/rpmp/pmpool/queue/blockingconcurrentqueue.h
@@ -0,0 +1,981 @@
+// Provides an efficient blocking version of moodycamel::ConcurrentQueue.
+// ©2015-2016 Cameron Desrochers. Distributed under the terms of the simplified
+// BSD license, available at the top of concurrentqueue.h.
+// Uses Jeff Preshing's semaphore implementation (under the terms of its
+// separate zlib license, embedded below).
+
+#pragma once
+
+#include "concurrentqueue.h"
+#include <type_traits>
+#include <cerrno>
+#include <memory>
+#include <chrono>
+#include <ctime>
+
+#if defined(_WIN32)
+// Avoid including windows.h in a header; we only need a handful of
+// items, so we'll redeclare them here (this is relatively safe since
+// the API generally has to remain stable between Windows versions).
+// I know this is an ugly hack but it still beats polluting the global
+// namespace with thousands of generic names or adding a .cpp for nothing.
+extern "C" {
+    struct _SECURITY_ATTRIBUTES;
+    __declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
+    __declspec(dllimport) int __stdcall CloseHandle(void* hObject);
+    __declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
+    __declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
+}
+#elif defined(__MACH__)
+#include <mach/mach.h>
+#elif defined(__unix__)
+#include <semaphore.h>
+#endif
+
+namespace moodycamel
+{
+namespace details
+{
+    // Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's
+    // portable + lightweight semaphore implementations, originally from
+    // https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
+    // LICENSE:
+    // Copyright (c) 2015 Jeff Preshing
+    //
+    // This software is provided 'as-is', without any express or implied
+    // warranty. In no event will the authors be held liable for any damages
+    // arising from the use of this software.
+    //
+    // Permission is granted to anyone to use this software for any purpose,
+    // including commercial applications, and to alter it and redistribute it
+    // freely, subject to the following restrictions:
+    //
+    // 1. The origin of this software must not be misrepresented; you must not
+    // claim that you wrote the original software. If you use this software
+    // in a product, an acknowledgement in the product documentation would be
+    // appreciated but is not required.
+    // 2.
Altered source versions must be plainly marked as such, and must not be + // misrepresented as being the original software. + // 3. This notice may not be removed or altered from any source distribution. + namespace mpmc_sema + { +#if defined(_WIN32) + class Semaphore + { + private: + void* m_hSema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + + public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + const long maxLong = 0x7fffffff; + m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr); + } + + ~Semaphore() + { + CloseHandle(m_hSema); + } + + void wait() + { + const unsigned long infinite = 0xffffffff; + WaitForSingleObject(m_hSema, infinite); + } + + bool try_wait() + { + const unsigned long RC_WAIT_TIMEOUT = 0x00000102; + return WaitForSingleObject(m_hSema, 0) != RC_WAIT_TIMEOUT; + } + + bool timed_wait(std::uint64_t usecs) + { + const unsigned long RC_WAIT_TIMEOUT = 0x00000102; + return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) != RC_WAIT_TIMEOUT; + } + + void signal(int count = 1) + { + ReleaseSemaphore(m_hSema, count, nullptr); + } + }; +#elif defined(__MACH__) + //--------------------------------------------------------- + // Semaphore (Apple iOS and OSX) + // Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html + //--------------------------------------------------------- + class Semaphore + { + private: + semaphore_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + + public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount); + } + + ~Semaphore() + { + semaphore_destroy(mach_task_self(), m_sema); + } + + void wait() + { + semaphore_wait(m_sema); + } + + bool try_wait() + { + return timed_wait(0); + } + + bool timed_wait(std::uint64_t timeout_usecs) + { + mach_timespec_t ts; + ts.tv_sec = static_cast(timeout_usecs / 1000000); + ts.tv_nsec = (timeout_usecs % 1000000) * 1000; + + // added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html + kern_return_t rc = semaphore_timedwait(m_sema, ts); + + return rc != KERN_OPERATION_TIMED_OUT && rc != KERN_ABORTED; + } + + void signal() + { + semaphore_signal(m_sema); + } + + void signal(int count) + { + while (count-- > 0) + { + semaphore_signal(m_sema); + } + } + }; +#elif defined(__unix__) + //--------------------------------------------------------- + // Semaphore (POSIX, Linux) + //--------------------------------------------------------- + class Semaphore + { + private: + sem_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + + public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + sem_init(&m_sema, 0, initialCount); + } + + ~Semaphore() + { + sem_destroy(&m_sema); + } + + void wait() + { + // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error + int rc; + do { + rc = sem_wait(&m_sema); + } while (rc == -1 && errno == EINTR); + } + + bool try_wait() + { + int rc; + do { + rc = sem_trywait(&m_sema); + } while (rc == -1 && errno == EINTR); + return !(rc == -1 && errno == EAGAIN); + } + + bool 
timed_wait(std::uint64_t usecs) + { + struct timespec ts; + const int usecs_in_1_sec = 1000000; + const int nsecs_in_1_sec = 1000000000; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += usecs / usecs_in_1_sec; + ts.tv_nsec += (usecs % usecs_in_1_sec) * 1000; + // sem_timedwait bombs if you have more than 1e9 in tv_nsec + // so we have to clean things up before passing it in + if (ts.tv_nsec >= nsecs_in_1_sec) { + ts.tv_nsec -= nsecs_in_1_sec; + ++ts.tv_sec; + } + + int rc; + do { + rc = sem_timedwait(&m_sema, &ts); + } while (rc == -1 && errno == EINTR); + return !(rc == -1 && errno == ETIMEDOUT); + } + + void signal() + { + sem_post(&m_sema); + } + + void signal(int count) + { + while (count-- > 0) + { + sem_post(&m_sema); + } + } + }; +#else +#error Unsupported platform! (No semaphore wrapper available) +#endif + + //--------------------------------------------------------- + // LightweightSemaphore + //--------------------------------------------------------- + class LightweightSemaphore + { + public: + typedef std::make_signed::type ssize_t; + + private: + std::atomic m_count; + Semaphore m_sema; + + bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) + { + ssize_t oldCount; + // Is there a better way to set the initial spin count? + // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC, + // as threads start hitting the kernel semaphore. + int spin = 10000; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) + return true; + std::atomic_signal_fence(std::memory_order_acquire); // Prevent the compiler from collapsing the loop. + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount > 0) + return true; + if (timeout_usecs < 0) + { + m_sema.wait(); + return true; + } + if (m_sema.timed_wait((std::uint64_t)timeout_usecs)) + return true; + // At this point, we've timed out waiting for the semaphore, but the + // count is still decremented indicating we may still be waiting on + // it. So we have to re-adjust the count, but only if the semaphore + // wasn't signaled enough times for us too since then. If it was, we + // need to release the semaphore too. + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + return true; + if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) + return false; + } + } + + ssize_t waitManyWithPartialSpinning(ssize_t max, std::int64_t timeout_usecs = -1) + { + assert(max > 0); + ssize_t oldCount; + int spin = 10000; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if (oldCount > 0) + { + ssize_t newCount = oldCount > max ? 
oldCount - max : 0; + if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) + return oldCount - newCount; + } + std::atomic_signal_fence(std::memory_order_acquire); + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount <= 0) + { + if (timeout_usecs < 0) + m_sema.wait(); + else if (!m_sema.timed_wait((std::uint64_t)timeout_usecs)) + { + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + break; + if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) + return 0; + } + } + } + if (max > 1) + return 1 + tryWaitMany(max - 1); + return 1; + } + + public: + LightweightSemaphore(ssize_t initialCount = 0) : m_count(initialCount) + { + assert(initialCount >= 0); + } + + bool tryWait() + { + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) + return true; + } + return false; + } + + void wait() + { + if (!tryWait()) + waitWithPartialSpinning(); + } + + bool wait(std::int64_t timeout_usecs) + { + return tryWait() || waitWithPartialSpinning(timeout_usecs); + } + + // Acquires between 0 and (greedily) max, inclusive + ssize_t tryWaitMany(ssize_t max) + { + assert(max >= 0); + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + ssize_t newCount = oldCount > max ? oldCount - max : 0; + if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) + return oldCount - newCount; + } + return 0; + } + + // Acquires at least one, and (greedily) at most max + ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs) + { + assert(max >= 0); + ssize_t result = tryWaitMany(max); + if (result == 0 && max > 0) + result = waitManyWithPartialSpinning(max, timeout_usecs); + return result; + } + + ssize_t waitMany(ssize_t max) + { + ssize_t result = waitMany(max, -1); + assert(result > 0); + return result; + } + + void signal(ssize_t count = 1) + { + assert(count >= 0); + ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release); + ssize_t toRelease = -oldCount < count ? -oldCount : count; + if (toRelease > 0) + { + m_sema.signal((int)toRelease); + } + } + + ssize_t availableApprox() const + { + ssize_t count = m_count.load(std::memory_order_relaxed); + return count > 0 ? count : 0; + } + }; + } // end namespace mpmc_sema +} // end namespace details + + +// This is a blocking version of the queue. It has an almost identical interface to +// the normal non-blocking version, with the addition of various wait_dequeue() methods +// and the removal of producer-specific dequeue methods. 
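+
+// Usage sketch (illustrative only, not part of the upstream moodycamel sources):
+// a minimal producer/consumer pairing of the class defined below. enqueue()
+// forwards to the lock-free inner queue and then signals the internal
+// semaphore, while wait_dequeue()/wait_dequeue_timed() block the consumer
+// until an element is available or the timeout expires. The element type,
+// element count, timeout value and include path are assumptions made for the
+// example; the path follows this repository's rpmp/pmpool/queue layout.
+//
+//   #include <cstdint>
+//   #include <thread>
+//   #include "pmpool/queue/blockingconcurrentqueue.h"
+//
+//   void blocking_queue_example()    // hypothetical helper, for illustration
+//   {
+//       moodycamel::BlockingConcurrentQueue<std::uint64_t> q;
+//
+//       std::thread producer([&q] {
+//           for (std::uint64_t i = 0; i < 100; i++)
+//               q.enqueue(i);                          // lock-free enqueue + semaphore signal
+//       });
+//
+//       std::thread consumer([&q] {
+//           std::uint64_t item;
+//           int received = 0;
+//           while (received < 100) {
+//               if (q.wait_dequeue_timed(item, 1000))  // block up to 1000 microseconds
+//                   received++;
+//           }
+//       });
+//
+//       producer.join();
+//       consumer.join();
+//   }
+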
+template +class BlockingConcurrentQueue +{ +private: + typedef ::moodycamel::ConcurrentQueue ConcurrentQueue; + typedef details::mpmc_sema::LightweightSemaphore LightweightSemaphore; + +public: + typedef typename ConcurrentQueue::producer_token_t producer_token_t; + typedef typename ConcurrentQueue::consumer_token_t consumer_token_t; + + typedef typename ConcurrentQueue::index_t index_t; + typedef typename ConcurrentQueue::size_t size_t; + typedef typename std::make_signed::type ssize_t; + + static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE; + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD; + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE; + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE; + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE; + static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE; + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit BlockingConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : inner(capacity), sema(create(), &BlockingConcurrentQueue::template destroy) + { + assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); + if (!sema) { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : inner(minCapacity, maxExplicitProducers, maxImplicitProducers), sema(create(), &BlockingConcurrentQueue::template destroy) + { + assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); + if (!sema) { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + // Disable copying and copy assignment + BlockingConcurrentQueue(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
+ BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : inner(std::move(other.inner)), sema(std::move(other.sema)) + { } + + inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + inner.swap(other.inner); + sema.swap(other.sema); + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + if ((details::likely)(inner.enqueue(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + if ((details::likely)(inner.enqueue(std::move(item)))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + if ((details::likely)(inner.enqueue(token, item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + if ((details::likely)(inner.enqueue(token, std::move(item)))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + inline bool enqueue_bulk(It itemFirst, size_t count) + { + if ((details::likely)(inner.enqueue_bulk(std::forward(itemFirst), count))) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. 
Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + if ((details::likely)(inner.enqueue_bulk(token, std::forward(itemFirst), count))) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + if (inner.try_enqueue(item)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + if (inner.try_enqueue(std::move(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + if (inner.try_enqueue(token, item)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + if (inner.try_enqueue(token, std::move(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(std::forward(itemFirst), count)) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(token, std::forward(itemFirst), count)) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
+ template + inline bool try_dequeue(U& item) + { + if (sema->tryWait()) { + while (!inner.try_dequeue(item)) { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue(consumer_token_t& token, U& item) + { + if (sema->tryWait()) { + while (!inner.try_dequeue(token, item)) { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + + + // Blocks the current thread until there's something to dequeue, then + // dequeues it. + // Never allocates. Thread-safe. + template + inline void wait_dequeue(U& item) + { + sema->wait(); + while (!inner.try_dequeue(item)) { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) { + return false; + } + while (!inner.try_dequeue(item)) { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed(item, std::chrono::duration_cast(timeout).count()); + } + + // Blocks the current thread until there's something to dequeue, then + // dequeues it using an explicit consumer token. + // Never allocates. Thread-safe. 
+ template + inline void wait_dequeue(consumer_token_t& token, U& item) + { + sema->wait(); + while (!inner.try_dequeue(token, item)) { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) { + return false; + } + while (!inner.try_dequeue(token, item)) { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed(token, item, std::chrono::duration_cast(timeout).count()); + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::int64_t timeout_usecs) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed(itemFirst, max, std::chrono::duration_cast(timeout).count()); + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. 
+ template + inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::int64_t timeout_usecs) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed(token, itemFirst, max, std::chrono::duration_cast(timeout).count()); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + inline size_t size_approx() const + { + return (size_t)sema->availableApprox(); + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static bool is_lock_free() + { + return ConcurrentQueue::is_lock_free(); + } + + +private: + template + static inline U* create() + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U* create(A1&& a1) + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) { + p->~U(); + } + (Traits::free)(p); + } + +private: + ConcurrentQueue inner; + std::unique_ptr sema; +}; + + +template +inline void swap(BlockingConcurrentQueue& a, BlockingConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} // end namespace moodycamel diff --git a/rpmp/pmpool/queue/concurrentqueue.h b/rpmp/pmpool/queue/concurrentqueue.h new file mode 100644 index 00000000..21cb9375 --- /dev/null +++ b/rpmp/pmpool/queue/concurrentqueue.h @@ -0,0 +1,3636 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. 
+// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2016, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this list of +// conditions and the following disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +#pragma once + +#if defined(__GNUC__) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. +// We'll override the default trait malloc ourselves without a macro. +#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. 
+#include +#endif +#include // for max_align_t +#include +#include +#include +#include +#include +#include +#include // for CHAR_BIT +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading + +// Platform-specific definitions of a numeric thread ID type and an invalid value +namespace moodycamel { namespace details { + template struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } + }; +} } +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel { namespace details { + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; + static inline thread_id_t thread_id() { return rl::thread_index(); } +} } +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the function +// we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +namespace moodycamel { namespace details { + static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. + static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } +} } +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) +namespace moodycamel { namespace details { + static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); + + typedef std::thread::id thread_id_t; + static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + + // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's + // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't + // be. 
+ static inline thread_id_t thread_id() { return std::this_thread::get_id(); } + + template struct thread_id_size { }; + template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; + template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; + + template<> struct thread_id_converter { + typedef thread_id_size::numeric_t thread_id_numeric_size_t; +#ifndef __APPLE__ + typedef std::size_t thread_id_hash_t; +#else + typedef thread_id_numeric_size_t thread_id_hash_t; +#endif + + static thread_id_hash_t prehash(thread_id_t const& x) + { +#ifndef __APPLE__ + return std::hash()(x); +#else + return *reinterpret_cast(&x); +#endif + } + }; +} } +#else +// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 +// In order to get a numeric thread ID in a platform-independent way, we use a thread-local +// static variable's address as a thread identifier :-) +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define MOODYCAMEL_THREADLOCAL __thread +#elif defined(_MSC_VER) +#define MOODYCAMEL_THREADLOCAL __declspec(thread) +#else +// Assume C++11 compliant compiler +#define MOODYCAMEL_THREADLOCAL thread_local +#endif +namespace moodycamel { namespace details { + typedef std::uintptr_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr + static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. + static inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } +} } +#endif + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw (expr) +#else +#define MOODYCAMEL_TRY if (true) +#define MOODYCAMEL_CATCH(...) else if (false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( +// We have to assume *all* non-trivial constructors may throw on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? 
std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#else +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 +// g++ <=4.7 doesn't support thread_local either. +// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) +// Assume `thread_local` is fully supported in all other C++11 compilers/platforms +//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now since several users report having problems with it on +#endif +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +// Compiler-specific likely/unlikely hints +namespace moodycamel { namespace details { +#if defined(__GNUC__) + static inline bool (likely)(bool x) { return __builtin_expect((x), true); } + static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +#else + static inline bool (likely)(bool x) { return x; } + static inline bool (unlikely)(bool x) { return x; } +#endif +} } + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG +#include "internal/concurrentqueue_internal_debug.h" +#endif + +namespace moodycamel { +namespace details { + template + struct const_numeric_max { + static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits::is_signed + ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) + : static_cast(-1); + }; + +#if defined(__GLIBCXX__) + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +#endif + + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. + typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} + +// Default traits for the ConcurrentQueue. 
To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. 
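+ // Illustrative sketch, not part of the upstream header: the intended way to
+ // customize these defaults is to inherit from ConcurrentQueueDefaultTraits,
+ // shadow only the members you want to change, and pass the derived struct as
+ // the queue's second template argument. The struct name and values below are
+ // arbitrary example choices.
+ //
+ //   struct MyTraits : public moodycamel::ConcurrentQueueDefaultTraits
+ //   {
+ //       static const size_t BLOCK_SIZE = 256;   // must stay a power of 2
+ //       typedef std::uint64_t index_t;          // wider index to avoid wrap-around
+ //   };
+ //   moodycamel::ConcurrentQueue<int, MyTraits> queue;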
+ static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; + + +#ifndef MCDBGQ_USE_RELACY + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } +#else + static inline void* malloc(size_t size) { return std::malloc(size); } + static inline void free(void* ptr) { return std::free(ptr); } +#endif +#else + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void* ptr) { return rl::rl_free(ptr, $); } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). +struct ProducerToken; +struct ConsumerToken; + +template class ConcurrentQueue; +template class BlockingConcurrentQueue; +class ConcurrentQueueTests; + + +namespace details +{ + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) + { + } + }; + + template struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } + }; + template<> struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } + }; + template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; + + static inline size_t hash_thread_id(thread_id_t id) + { + static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( + thread_id_converter::prehash(id))); + } + + template + static inline bool circular_less_than(T a, T b) + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4554) +#endif + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER 
+#pragma warning(pop) +#endif + } + + template + static inline char* align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } + + template + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static inline void swap_relaxed(std::atomic& left, std::atomic& right) + { + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); + } + + template + static inline T const& nomove(T const& x) + { + return x; + } + + template + struct nomove_if + { + template + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if + { + template + static inline auto eval(U&& x) + -> decltype(std::forward(x)) + { + return std::forward(x); + } + }; + + template + static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) + { + return *it; + } + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + template struct is_trivially_destructible : std::is_trivially_destructible { }; +#else + template struct is_trivially_destructible : std::has_trivial_destructor { }; +#endif + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY + typedef RelacyThreadExitListener ThreadExitListener; + typedef RelacyThreadExitNotifier ThreadExitNotifier; +#else + struct ThreadExitListener + { + typedef void (*callback_t)(void*); + callback_t callback; + void* userData; + + ThreadExitListener* next; // reserved for use by the ThreadExitNotifier + }; + + + class ThreadExitNotifier + { + public: + static void subscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + listener->next = tlsInst.tail; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + ThreadExitListener** prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier() : tail(nullptr) { } + ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier& instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + private: + ThreadExitListener* tail; + }; +#endif +#endif + + template struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template struct static_is_lock_free : static_is_lock_free_num::type> { }; + template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; + template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} + + +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue& queue); + + template + explicit ProducerToken(BlockingConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. 
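+ // Illustrative sketch, not part of the upstream header: a producer token is
+ // constructed from the queue it will be used with and is normally kept for as
+ // long as that thread produces into the queue. Queue type and variable names
+ // here are examples only.
+ //
+ //   moodycamel::ConcurrentQueue<int> queue;
+ //   moodycamel::ProducerToken ptok(queue);   // may allocate; check valid()
+ //   if (ptok.valid()) {
+ //       queue.enqueue(ptok, 17);             // token-based fast path
+ //   }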
+ inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + template + explicit ConsumerToken(BlockingConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. +// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; + + +template +class ConcurrentQueue +{ +public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) 
+#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). 
+ explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
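+ // Illustrative sketch, not part of the upstream header, showing the two
+ // constructors documented above; the capacity and producer counts are
+ // arbitrary example values.
+ //
+ //   // At least 1024 element slots (blocks are not shared between producers):
+ //   moodycamel::ConcurrentQueue<int> q1(1024);
+ //
+ //   // Pre-allocate for at least 1024 elements with at most 2 explicit
+ //   // (token-based) and 8 implicit (token-less) producer threads:
+ //   moodycamel::ConcurrentQueue<int> q2(1024, 2, 8);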
+ ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + ConcurrentQueue& swap_internal(ConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); +#endif + + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. 
+ inline bool enqueue(T const& item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. 
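+ // Illustrative sketch, not part of the upstream header, of the enqueue
+ // overloads declared above; element type and values are examples only.
+ //
+ //   moodycamel::ConcurrentQueue<int> q;
+ //   q.enqueue(1);                      // copies; allocates a block if needed
+ //   int value = 2;
+ //   q.enqueue(std::move(value));       // moves when the element type supports it
+ //   if (!q.try_enqueue(3)) {           // does not allocate; fails when no room
+ //       // handle back-pressure here
+ //   }
+ //   int batch[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ //   q.enqueue_bulk(batch, 8);          // one bulk call amortizes the overhead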
+ inline bool try_enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U& item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. + size_t nonEmptyCount = 0; + ProducerBase* best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall throughput + // under contention, but will give more predictable results in single-threaded + // consumer scenarios. This is mostly only useful for internal unit tests. + // Never allocates. Thread-safe. + template + bool try_dequeue_non_interleaved(U& item) + { + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->dequeue(item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. 
+ // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(consumer_token_t& token, U& item) + { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less + // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place + // If there's no items where you're supposed to be, keep moving until you find a producer with some items + // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it + + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (static_cast(token.currentProducer)->dequeue(item)) { + if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + if (ptr->dequeue(item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
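+ // Illustrative sketch, not part of the upstream header: dequeuing with and
+ // without a consumer token. Variable names are examples only.
+ //
+ //   moodycamel::ConcurrentQueue<int> q;
+ //   int item;
+ //   if (q.try_dequeue(item)) { /* consumed one element */ }
+ //
+ //   moodycamel::ConsumerToken ctok(q);   // amortizes producer selection/rotation
+ //   int buf[32];
+ //   size_t n = q.try_dequeue_bulk(ctok, buf, 32);
+ //   // the first n slots of buf were filled; n == 0 means it appeared empty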
+ template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
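+ // Illustrative sketch, not part of the upstream header: dequeuing straight from
+ // a known producer's sub-queue and querying the approximate size. Names are
+ // examples only.
+ //
+ //   moodycamel::ConcurrentQueue<int> q;
+ //   moodycamel::ProducerToken ptok(q);
+ //   q.enqueue(ptok, 5);
+ //   int item;
+ //   bool got = q.try_dequeue_from_producer(ptok, item);  // producer-specific path
+ //   size_t approx = q.size_approx();  // exact only once the queue has stabilized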
+ static bool is_lock_free() + { + return + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } + + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const& token, U&& element) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue(U&& element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. 
Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. 
the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true) + { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template + inline bool set_empty(index_t i) + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
+ template + inline bool set_many_empty(index_t i, size_t count) + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + + private: + // IMPORTANT: This must be the first member in Block, so that if T depends on the alignment of + // addresses returned by malloc, that alignment will be preserved. Apparently clang actually + // generates code that uses this assumption for AVX instructions in some cases. Ideally, we + // should also align Block to the alignment of T in case it's higher than malloc's 16-byte + // alignment, but this is hard to do in a cross-platform way. Assert for this case: + static_assert(std::alignment_of::value <= std::alignment_of::value, "The queue does not support super-aligned types at this time"); + // Additionally, we need the alignment of Block itself to be a multiple of max_align_t since + // otherwise the appropriate padding will not be added at the end of Block in order to make + // arrays of Blocks all be properly aligned (not just the first one). We use a union to force + // this. + union { + char elements[sizeof(T) * BLOCK_SIZE]; + details::max_align_t dummy; + }; + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void* owner; +#endif + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + + +#ifdef MCDBGQ_TRACKMEM +public: + struct MemStats; +private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) + { + } + + virtual ~ProducerBase() { }; + + template + inline bool dequeue(U& element) + { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } + else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + else { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* parent) : + ProducerBase(parent, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(parent->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
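+ // (That is, every element in the index range [headIndex, tailIndex) is still fully constructed and no
+ // concurrent dequeue can be observing it any more, so destroying them all here is safe.)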
+ if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) { + destroy(block); + } + else { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. 
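+ // (This is the fast path: no allocation and no free-list traffic, just the reset_empty() above and a
+ // refreshed block index entry further down.)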
+ } + else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + + if (allocMode == CannotAlloc || !new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + // The constructor may throw. We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common case when the queue is + // empty and the values are eventually consistent -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, they are not going to change (unless we + // change them) and must be the same value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. + // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in + // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). + // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all + // read-modify-write operations are guaranteed to work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only the C++11 standard. + // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever + // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now + // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. + // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) + // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. + + // Note that we reload tail here in case it changed; it will be the same value as before or greater, since + // this load is sequenced after (happens after) the earlier load above. 
This is supported by read-read + // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least one element, this + // will never exceed tail. We need to do an acquire-release fence here since it's possible + // that whatever condition got us to this point was for an earlier enqueued element (that + // we already see the memory effects for), but that by the time we increment somebody else + // has incremented it, and we need to see the memory effects for *that* element, which is + // in such a case is necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a tail that is at least + // as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because of index wrap-around. + // When an index wraps, we need to preserve the sign of the offset when dividing it by the + // block size (in order to get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / BLOCK_SIZE); + auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; + + // Dequeue + auto& el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even if the assignment + // throws + struct Guard { + Block* block; + index_t index; + + ~Guard() + { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty(index); + } + } guard = { block, index }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty(index); + } + + return true; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write + } + } + + return false; + } + + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
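+ // Illustrative arithmetic (assuming the default BLOCK_SIZE of 32): with startTailIndex == 30 and count == 5,
+ // blockBaseDiff below works out to ((30 + 5 - 1) & ~31) - ((30 - 1) & ~31) = 32 - 0 = 32, i.e. exactly one
+ // extra block has to be found or allocated before any element is constructed.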
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + if (allocMode == CannotAlloc || full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + auto stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst))) && firstAllocatedBlock != nullptr) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);; + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + auto endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) { + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer 
only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer* nextExplicitProducer; + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue* parent) : + ProducerBase(parent, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { + // Note that since we're in the destructor we can assume that all enqueue/dequeue operations + // completed already; this means that all undequeued elements are placed contiguously across + // contiguous blocks, and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
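+ // (forceFreeLastBlock covers the case where the loop above ran: the loop never frees the block it is
+ // currently walking, so the final block must still be handed back here even when tail is block-aligned.)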
+ if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) 
{ + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = { block, index, entry, this->parent }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty(index)) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + } + + return true; + } + else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. 
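+ // (Concretely: when tailIndex is exactly block-aligned here, the code below never dereferences the old
+ // this->tailBlock; it only overwrites the pointer once freshly requisitioned blocks are in place, so a
+ // tail block that has meanwhile migrated to the free list is never touched.)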
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + auto stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? 
desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + auto endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) { + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires the block + // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
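+ // (The nullptr store below is relaxed on purpose: correctness does not depend on when other threads
+ // observe it, since the block itself is only ever reused after being re-acquired through the free list.)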
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the constructor + } + auto newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + if (allocMode == CannotAlloc || !new_block_index()) { + return false; + } + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast(static_cast::type>(index - tailBase) / BLOCK_SIZE); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + 
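+ // (e.g. with 32-bit indices, index 0xFFFFFFE0 and tailBase 0 after the tail has wrapped: the raw difference
+ // is 0xFFFFFFE0, which reinterpreted as signed is -32, giving the intended offset of -1 block under the
+ // default BLOCK_SIZE of 32, whereas an unsigned shift would yield a huge positive offset.)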
assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer* nextImplicitProducer; + private: +#endif + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + if (canAlloc == CanAlloc) { + return create(); + } + + return nullptr; + } + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = { 0 }; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 
0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { + //auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast(index->prev); + } + } + } + + auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() + { + return MemStats::getFor(this); + } + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { + bool recycled; + return recycle_or_create_producer(isExplicit, recycled); + } + + ProducerBase* recycle_or_create_producer(bool isExplicit, bool& recycled) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + recycled = true; + return ptr; + } + } + } + + recycled = false; + return add_producer(isExplicit ? 
static_cast(create(this)) : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } + else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! + for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) { } + + ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP* entries; + ImplicitProducerHash* prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return; + + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + + void swap_implicit_producer_hashes(ConcurrentQueue& other) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return; + + // Swap (assumes our implicit producer hash is initialized) + 
initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer* get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. + // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1; + + auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + auto newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = newCapacity; + newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } + else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + bool recycled; + auto producer = static_cast(recycle_or_create_producer(false, recycled)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + if (recycled) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + 
producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + auto probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited(ImplicitProducer* producer) + { + // Remove from thread exit listeners + details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener); + + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on the current one yet and are + // trying to add an entry thinking there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1; + probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + hash->entries[index].key.store(details::invalid_thread_id2, std::memory_order_release); + break; + } + ++index; + } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void* userData) + { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline U* create_array(size_t count) + { + assert(count > 0); + auto p = static_cast((Traits::malloc)(sizeof(U) * count)); + if (p == nullptr) { + return nullptr; + } + + for (size_t i = 0; i != count; ++i) { + new (p + i) U(); + } + return p; + } + + template + static inline void destroy_array(U* p, size_t count) + { + if (p != nullptr) { + assert(count > 0); + 
for (size_t i = count; i != 0; ) {
+				(p + --i)->~U();
+			}
+			(Traits::free)(p);
+		}
+	}
+
+	template<typename U>
+	static inline U* create()
+	{
+		auto p = (Traits::malloc)(sizeof(U));
+		return p != nullptr ? new (p) U : nullptr;
+	}
+
+	template<typename U, typename A1>
+	static inline U* create(A1&& a1)
+	{
+		auto p = (Traits::malloc)(sizeof(U));
+		return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
+	}
+
+	template<typename U>
+	static inline void destroy(U* p)
+	{
+		if (p != nullptr) {
+			p->~U();
+		}
+		(Traits::free)(p);
+	}
+
+private:
+	std::atomic<ProducerBase*> producerListTail;
+	std::atomic<std::uint32_t> producerCount;
+
+	std::atomic<size_t> initialBlockPoolIndex;
+	Block* initialBlockPool;
+	size_t initialBlockPoolSize;
+
+#if !MCDBGQ_USEDEBUGFREELIST
+	FreeList<Block> freeList;
+#else
+	debug::DebugFreeList<Block> freeList;
+#endif
+
+	std::atomic<ImplicitProducerHash*> implicitProducerHash;
+	std::atomic<size_t> implicitProducerHashCount;  // Number of slots logically used
+	ImplicitProducerHash initialImplicitProducerHash;
+	std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
+	std::atomic_flag implicitProducerHashResizeInProgress;
+
+	std::atomic<std::uint32_t> nextExplicitConsumerId;
+	std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+	debug::DebugMutex implicitProdMutex;
+#endif
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	std::atomic<ExplicitProducer*> explicitProducers;
+	std::atomic<ImplicitProducer*> implicitProducers;
+#endif
+};
+
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
+	: producer(queue.recycle_or_create_producer(true))
+{
+	if (producer != nullptr) {
+		producer->token = this;
+	}
+}
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
+	: producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true))
+{
+	if (producer != nullptr) {
+		producer->token = this;
+	}
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+	lastKnownGlobalOffset = -1;
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+	lastKnownGlobalOffset = -1;
+}
+
+template<typename T, typename Traits>
+inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+}
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
diff --git a/rpmp/test/CMakeLists.txt b/rpmp/test/CMakeLists.txt
new file mode 100644
index 00000000..17a8c9e5
--- /dev/null
+++ b/rpmp/test/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_executable(unit_tests unit_test/main.cc unit_test/DigestTest.cc unit_test/CircularBufferTest.cc)
+target_link_libraries(unit_tests gtest_main pmpool)
+
+add_test(NAME unit_tests COMMAND unit_tests)
+
+add_executable(RemoteRead integration_test/RemoteRead.cc)
+target_link_libraries(RemoteRead pmpool)
diff --git a/rpmp/test/integration_test/RemoteRead.cc b/rpmp/test/integration_test/RemoteRead.cc
new file mode 100644
index 00000000..b7ac7b0c
--- /dev/null
+++ b/rpmp/test/integration_test/RemoteRead.cc
@@ -0,0 +1,67 @@
+/*
+ * Filename: /mnt/spark-pmof/tool/rpmp/test/integration_test/RemoteRead.cc
+ * Path: /mnt/spark-pmof/tool/rpmp/test/integration_test
+ * Created Date: Friday, December 20th 2019, 8:29:23 am
+ * Author: root
+ *
+ * Copyright (c) 2019 Intel
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <chrono>  // NOLINT
+#include <iostream>
+#include <mutex>   // NOLINT
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "pmpool/client/PmPoolClient.h"
+
+std::vector<char*> strs;
+char str_read[4096];
+int count = 0;
+std::mutex mtx;
+uint64_t address[2];
+
+uint64_t timestamp_now() {
+  return std::chrono::high_resolution_clock::now().time_since_epoch() /
+         std::chrono::milliseconds(1);
+}
+
+void func(PmPoolClient* client) {
+  while (true) {
+    std::unique_lock<std::mutex> lk(mtx);
+    uint64_t count_ = count++;
+    lk.unlock();
+    if (count_ < 2) {
+      client->read(address[count_], str_read, strlen(strs[count_]));
+      assert(strncmp(str_read, strs[count_], strlen(strs[count_])) == 0);
+    } else {
+      break;
+    }
+  }
+}
+
+int main() {
+  char str[] = "hello world";
+  char str1[] = "hello rpmp";
+  strs.push_back(str);
+  strs.push_back(str1);
+  std::vector<std::thread*> threads;
+  int num = 0;
+  PmPoolClient client("172.168.0.40", "12346");
+  client.init();
+  address[0] = client.write(strs[0], strlen(strs[0]));
+  address[1] = client.write(strs[1], strlen(strs[1]));
+  for (int i = 0; i < 1; i++) {
+    num++;
+    auto t = new std::thread(func, &client);
+    threads.push_back(t);
+  }
+  for (int i = 0; i < num; i++) {
+    threads[i]->join();
+    delete threads[i];
+  }
+  client.free(address[0]);
+  client.free(address[1]);
+  std::cout << "finished." << std::endl;
+  client.shutdown();
+  client.wait();
+  return 0;
+}
diff --git a/rpmp/test/unit_test/CircularBufferTest.cc b/rpmp/test/unit_test/CircularBufferTest.cc
new file mode 100644
index 00000000..e87a4bec
--- /dev/null
+++ b/rpmp/test/unit_test/CircularBufferTest.cc
@@ -0,0 +1,114 @@
+/*
+ * Filename: /mnt/spark-pmof/tool/rpmp/test/CircularBufferTest.cc
+ * Path: /mnt/spark-pmof/tool/rpmp/test
+ * Created Date: Tuesday, December 24th 2019, 8:56:37 am
+ * Author: root
+ *
+ * Copyright (c) 2019 Intel
+ */
+
+#include <chrono>  // NOLINT
+#include <thread>  // NOLINT
+#include <iostream>
+
+#include "../pmpool/buffer/CircularBuffer.h"
+#include "gtest/gtest.h"
+
+#define private public
+
+TEST(circularbuffer, 1B) {
+  CircularBuffer buffer(1, 10);
+  uint64_t addr = 0;
+  buffer.get(2, &addr);
+  ASSERT_EQ(addr, 0);
+  ASSERT_EQ(buffer.get_write_(), 2);
+  buffer.get(2, &addr);
+  ASSERT_EQ(addr, 2);
+  ASSERT_EQ(buffer.get_write_(), 4);
+  buffer.get(2, &addr);
+  ASSERT_EQ(addr, 4);
+  ASSERT_EQ(buffer.get_write_(), 6);
+  buffer.get(2, &addr);
+  ASSERT_EQ(addr, 6);
+  ASSERT_EQ(buffer.get_write_(), 8);
+  buffer.get(2, &addr);
+  ASSERT_EQ(addr, 8);
+  ASSERT_EQ(buffer.get_write_(), 0);
+  buffer.put(addr, 2);
+  ASSERT_EQ(buffer.get_read_(), 0);
+  addr = 0;
+  buffer.put(addr, 4);
+  ASSERT_EQ(buffer.get_read_(), 4);
+  buffer.get(3, &addr);
+  ASSERT_EQ(addr, 0);
+  ASSERT_EQ(buffer.get_write_(), 3);
+  buffer.put(4, 4);
+  ASSERT_EQ(buffer.get_read_(), 0);
+  buffer.get(4, &addr);
+  ASSERT_EQ(addr, 3);
+  ASSERT_EQ(buffer.get_write_(), 7);
+  buffer.get(3, &addr);
+  ASSERT_EQ(addr, 7);
+  ASSERT_EQ(buffer.get_write_(), 0);
+  buffer.put(5, 4);
+  ASSERT_EQ(buffer.get_read_(), 0);
+  buffer.put(1, 4);
+  ASSERT_EQ(buffer.get_read_(), 0);
+  addr = 0;
+  buffer.put(addr, 1);
+  ASSERT_EQ(buffer.get_read_(), 9);
+  buffer.put(9, 1);
+  ASSERT_EQ(buffer.get_read_(), 0);
+}
+
+TEST(circularbuffer, 4K) {
+  CircularBuffer buffer(4096, 4);
+  uint64_t addr = 0;
+  buffer.get(10, &addr);
+  ASSERT_EQ(addr, 0);
+  ASSERT_EQ(buffer.get_write_(), 1);
+  buffer.get(10, &addr);
+  ASSERT_EQ(addr, 1);
+  ASSERT_EQ(buffer.get_write_(), 2);
+  buffer.get(4097, &addr);
+  ASSERT_EQ(addr, 2);
+  ASSERT_EQ(buffer.get_write_(), 0);
+  buffer.put(2, 2);
+  ASSERT_EQ(buffer.get_read_(), 0);
+  buffer.put(3, 2);
+  ASSERT_EQ(buffer.get_read_(), 0);
+  addr = 0;
+  buffer.put(addr, 2);
+  ASSERT_EQ(buffer.get_read_(), 1);
+  buffer.put(1, 2);
+  ASSERT_EQ(buffer.get_read_(), 0);
+}
+
+void func(CircularBuffer* buffer) {
+  std::cout << "sleep..." << std::endl;
+  std::this_thread::sleep_for(std::chrono::seconds(1));
+  uint64_t addr = 0;
+  buffer->put(addr, 2);
+  std::cout << "put buffer [0, 1]..." << std::endl;
+  buffer->dump();
+  std::this_thread::sleep_for(std::chrono::seconds(1));
+  buffer->put(4, 2);
+  std::cout << "put buffer [4, 5]..." << std::endl;
+  buffer->dump();
+  std::this_thread::sleep_for(std::chrono::seconds(1));
+  buffer->put(2, 2);
+  std::cout << "put buffer [2, 3]..." << std::endl;
+  buffer->dump();
+}
+
+TEST(circularbuffer, multithread) {
+  CircularBuffer buffer(1, 8);
+  uint64_t addr = 0;
+  buffer.get(6, &addr);
+  ASSERT_EQ(addr, 0);
+  std::thread t(func, &buffer);
+  buffer.get(8, &addr);
+  std::cout << "get buffer..." << std::endl;
+  ASSERT_EQ(addr, 0);
+  t.join();
+}
diff --git a/rpmp/test/unit_test/DigestTest.cc b/rpmp/test/unit_test/DigestTest.cc
new file mode 100644
index 00000000..aa3d4ba0
--- /dev/null
+++ b/rpmp/test/unit_test/DigestTest.cc
@@ -0,0 +1,24 @@
+/*
+ * Filename: /mnt/spark-pmof/tool/rpmp/test/DigestTest.cc
+ * Path: /mnt/spark-pmof/tool/rpmp/test
+ * Created Date: Thursday, November 7th 2019, 3:48:52 pm
+ * Author: root
+ *
+ * Copyright (c) 2019 Intel
+ */
+
+#include <stdint.h>
+
+#include <iostream>
+#include <string>
+
+#include "pmpool/Digest.h"
+#include "gtest/gtest.h"
+
+TEST(digest, compute) {
+  std::string str = "hello world";
+  uint64_t hash_value_1;
+  uint64_t hash_value_2;
+  Digest::computeKeyHash(str, &hash_value_1);
+  Digest::computeKeyHash(str, &hash_value_2);
+  ASSERT_TRUE(hash_value_1 == hash_value_2);
+}
diff --git a/rpmp/test/unit_test/main.cc b/rpmp/test/unit_test/main.cc
new file mode 100644
index 00000000..796154a1
--- /dev/null
+++ b/rpmp/test/unit_test/main.cc
@@ -0,0 +1,15 @@
+/*
+ * Filename: /mnt/spark-pmof/tool/rpmp/test/main.cc
+ * Path: /mnt/spark-pmof/tool/rpmp/test
+ * Created Date: Thursday, November 7th 2019, 3:48:52 pm
+ * Author: root
+ *
+ * Copyright (c) 2019 Intel
+ */
+
+#include "gtest/gtest.h"
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
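
Note: the integration test above exercises the full client round trip (write two payloads to the RPMP server, read them back from a worker thread, verify the contents, then free the remote allocations). The single-threaded sketch below distills that flow for reference. It is only an illustration assembled from the PmPoolClient calls that appear in RemoteRead.cc (init, write, read, free, shutdown, wait); the server address 172.168.0.40:12346 is simply reused from that test and is not a documented default.

// remote_roundtrip_sketch.cc (illustrative only; mirrors RemoteRead.cc above)
#include <assert.h>
#include <stdint.h>
#include <string.h>

#include <iostream>

#include "pmpool/client/PmPoolClient.h"

int main() {
  // Address and port are copied from the integration test above.
  PmPoolClient client("172.168.0.40", "12346");
  client.init();

  char msg[] = "hello rpmp";
  // write() stores the payload on the remote pool and returns its remote address.
  uint64_t addr = client.write(msg, strlen(msg));

  // read() copies the requested number of bytes from that address into a local buffer.
  char buf[64] = {0};
  client.read(addr, buf, strlen(msg));
  assert(strncmp(buf, msg, strlen(msg)) == 0);

  client.free(addr);  // release the remote allocation
  client.shutdown();
  client.wait();
  std::cout << "round trip ok" << std::endl;
  return 0;
}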