diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 1df96737..00000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,6 +0,0 @@ -before_script: - - whoami; docker info - -build_image: - script: - - docker build -f docker/ubuntu18/DockerFile --build-arg http_proxy="$http_proxy" --build-arg https_proxy="$http_proxy" --build-arg MAVEN_OPTS="$MAVEN_OPTS" . diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..d62e6e63 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "rpmp/include/spdlog"] + path = rpmp/include/spdlog + url = https://github.com/gabime/spdlog.git + branch = master diff --git a/core/src/main/scala/org/apache/spark/storage/pmof/RdmaShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/pmof/RdmaShuffleBlockFetcherIterator.scala index 15b2a3ae..d3c3e064 100644 --- a/core/src/main/scala/org/apache/spark/storage/pmof/RdmaShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/pmof/RdmaShuffleBlockFetcherIterator.scala @@ -357,7 +357,6 @@ final class RdmaShuffleBlockFetcherIterator(context: TaskContext, reqsInFlight.incrementAndGet val blockManagerId = rdmaRequest.blockManagerId val shuffleBlockIdName = rdmaRequest.shuffleBlockIdName - println("shuffle block name " + shuffleBlockIdName) val pmofTransferService = shuffleClient.asInstanceOf[PmofTransferService] diff --git a/rpmp/CMakeLists.txt b/rpmp/CMakeLists.txt new file mode 100644 index 00000000..0609bd26 --- /dev/null +++ b/rpmp/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.11) +project(rpmof) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "-std=c++14 -g -pthread -fPIC") + +# Generate compile_commands.json +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# place binaries and libraries according to GNU standards +include(GNUInstallDirs) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}) + +if(CMAKE_CXX_COMPILER_ID MATCHES GNU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage") +endif() + +include(cmake/googletest.cmake) + +fetch_googletest( + ${PROJECT_SOURCE_DIR}/cmake + ${PROJECT_BINARY_DIR}/googletest +) + +find_package(Boost REQUIRED COMPONENTS program_options) +if(Boost_FOUND) + include_directories(${Boost_INCLUDE_DIRS}) +endif() + +include_directories(${PROJECT_SOURCE_DIR}/) +include_directories(${PROJECT_SOURCE_DIR}/include) +include_directories(${PROJECT_SOURCE_DIR}/include/spdlog/include) +include_directories(${PROJECT_BINARY_DIR}/googletest/googletest-src/googletest/include) + +enable_testing() + +add_subdirectory(include/spdlog) +add_subdirectory(pmpool) +add_subdirectory(test) +add_subdirectory(benchmark) + +add_executable(main main.cc) +target_link_libraries(main pmpool spdlog) diff --git a/rpmp/README.md b/rpmp/README.md new file mode 100644 index 00000000..a217906e --- /dev/null +++ b/rpmp/README.md @@ -0,0 +1 @@ +# RPMP diff --git a/rpmp/benchmark/CMakeLists.txt b/rpmp/benchmark/CMakeLists.txt new file mode 100644 index 00000000..468d6b15 --- /dev/null +++ b/rpmp/benchmark/CMakeLists.txt @@ -0,0 +1,17 @@ +add_executable(local_allocate local_allocate.cc) +target_link_libraries(local_allocate pmpool) + +add_executable(remote_allocate remote_allocate.cc) +target_link_libraries(remote_allocate pmpool) + +add_executable(remote_write remote_write.cc) 
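Since the top-level CMakeLists.txt fetches googletest at configure time, turns on enable_testing(), and adds a test/ subdirectory, new RPMP tests can follow the usual gtest_main pattern. The sketch below is illustrative only: it assumes a test target linked against gtest_main and pmpool, and it reuses the CircularBuffer get/put calls from the benchmark later in this patch with smaller, made-up sizes.

// Hypothetical smoke test for rpmp/test/, not part of this patch.
#include <cstring>

#include "gtest/gtest.h"
#include "pmpool/buffer/CircularBuffer.h"

TEST(CircularBufferTest, GetThenPutRoundTrip) {
  CircularBuffer buffer(1024, 16);   // assumed (block size, block count) arguments
  char* dest = buffer.get(1024);     // reserve one block
  ASSERT_NE(dest, nullptr);
  std::memset(dest, 'x', 1024);
  buffer.put(dest, 1024);            // hand the block back to the ring
}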
+target_link_libraries(remote_write pmpool) + +add_executable(remote_allocate_write remote_allocate_write.cc) +target_link_libraries(remote_allocate_write pmpool) + +add_executable(circularbuffer circularbuffer.cc) +target_link_libraries(circularbuffer pmpool) + +add_executable(remote_read remote_read.cc) +target_link_libraries(remote_read pmpool) diff --git a/rpmp/benchmark/circularbuffer.cc b/rpmp/benchmark/circularbuffer.cc new file mode 100644 index 00000000..35ee8047 --- /dev/null +++ b/rpmp/benchmark/circularbuffer.cc @@ -0,0 +1,35 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/benchmark/circularbuffer.cc + * Path: /mnt/spark-pmof/tool/rpmp/benchmark + * Created Date: Monday, December 30th 2019, 9:57:10 am + * Author: root + * + * Copyright (c) 2019 Your Company + */ + +#include + +#include "pmpool/buffer/CircularBuffer.h" + +#include // NOLINT + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +int main() { + CircularBuffer circularbuffer(1024 * 1024, 2048); + uint64_t start = timestamp_now(); + char str[1048576]; + memset(str, '0', 1048576); + for (int i = 0; i < 20480; i++) { + char* buf = circularbuffer.get(1048576); + memcpy(buf, str, 1048576); + circularbuffer.put(buf, 1048576); + } + uint64_t end = timestamp_now(); + std::cout << "pmemkv put test: 1048576 " + << " bytes test, consumes " << (end - start) / 1000.0 << std::endl; + return 0; +} \ No newline at end of file diff --git a/rpmp/benchmark/local_allocate.cc b/rpmp/benchmark/local_allocate.cc new file mode 100644 index 00000000..8faadd68 --- /dev/null +++ b/rpmp/benchmark/local_allocate.cc @@ -0,0 +1,85 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/benchmark/local_allocate.cc + * Path: /mnt/spark-pmof/tool/rpmp/benchmark + * Created Date: Tuesday, December 24th 2019, 8:54:38 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include + +#include +#include +#include // NOLINT +#include // NOLINT +#include + +#include "../pmpool/AllocatorProxy.h" +#include "../pmpool/Config.h" +#include "../pmpool/Log.h" +#include "gtest/gtest.h" + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +std::mutex mtx; +uint64_t count = 0; +char str[1048576]; + +void func(AllocatorProxy *proxy, int index) { + while (true) { + std::unique_lock lk(mtx); + uint64_t count_ = count++; + lk.unlock(); + if (count_ < 20480) { + uint64_t addr = proxy->allocate_and_write(1048576, nullptr, index); + proxy->write(addr, str, 1048576); + } else { + break; + } + } +} + +int main() { + std::shared_ptr config = std::make_shared(); + config->init(0, nullptr); + std::shared_ptr log = std::make_shared(config.get()); + auto allocatorProxy = new AllocatorProxy(config.get(), log.get(), nullptr); + allocatorProxy->init(); + std::vector threads; + memset(str, '0', 1048576); + + uint64_t start = timestamp_now(); + int num = 0; + for (int i = 0; i < 4; i++) { + num++; + auto t = new std::thread(func, allocatorProxy, i); + threads.push_back(t); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + if (i == 0) { + CPU_SET(2, &cpuset); + } else if (i == 1) { + CPU_SET(40, &cpuset); + } else if (i == 2) { + CPU_SET(27, &cpuset); + } else { + CPU_SET(60, &cpuset); + } + int rc = + pthread_setaffinity_np(t->native_handle(), sizeof(cpu_set_t), &cpuset); + } + for (int i = 0; i < num; i++) { + threads[i]->join(); + delete threads[i]; + } + uint64_t end = timestamp_now(); + std::cout << "pmemkv put test: 
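local_allocate.cc pins each worker thread to a hard-coded CPU with an if/else chain and discards the pthread_setaffinity_np return code. A small helper, sketched below, keeps the pinning in one place and surfaces failures; the CPU ids themselves stay machine-specific and are not suggested here.

#include <pthread.h>
#include <sched.h>

#include <cstdio>
#include <thread>

// Pin a std::thread to a single CPU and report failure instead of ignoring it.
bool pin_to_cpu(std::thread& t, int cpu) {
  cpu_set_t cpuset;
  CPU_ZERO(&cpuset);
  CPU_SET(cpu, &cpuset);
  int rc = pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset);
  if (rc != 0) {
    std::fprintf(stderr, "pthread_setaffinity_np(cpu %d) failed: %d\n", cpu, rc);
    return false;
  }
  return true;
}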
1048576 " + << " bytes test, consumes " << (end - start) / 1000.0 + << "s, throughput is " << 20480 / ((end - start) / 1000.0) << "MB/s" + << std::endl; + allocatorProxy->release_all(); +} diff --git a/rpmp/benchmark/remote_allocate.cc b/rpmp/benchmark/remote_allocate.cc new file mode 100644 index 00000000..7762783a --- /dev/null +++ b/rpmp/benchmark/remote_allocate.cc @@ -0,0 +1,86 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/benchmark/allocate_perf.cc + * Path: /mnt/spark-pmof/tool/rpmp/benchmark + * Created Date: Friday, December 20th 2019, 8:29:23 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include // NOLINT +#include +#include "pmpool/client/PmPoolClient.h" + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +std::atomic count = {0}; +std::mutex mtx; +std::vector clients; +std::map> addresses; + +void func(int i) { + while (true) { + uint64_t count_ = count++; + if (count_ < 20480) { + clients[i]->begin_tx(); + if (addresses.count(i) != 0) { + auto vec = addresses[i]; + uint64_t addr = clients[i]->alloc(1048576); + vec.push_back(addr); + } else { + std::vector vec; + uint64_t addr = clients[i]->alloc(1048576); + vec.push_back(addr); + addresses[i] = vec; + } + clients[i]->end_tx(); + } else { + break; + } + } +} + +int main() { + std::vector threads; + int num = 0; + for (int i = 0; i < 4; i++) { + PmPoolClient *client = new PmPoolClient("172.168.0.40", "12346"); + client->begin_tx(); + client->init(); + client->end_tx(); + clients.push_back(client); + num++; + } + uint64_t start = timestamp_now(); + for (int i = 0; i < num; i++) { + auto t = new std::thread(func, i); + threads.push_back(t); + } + for (int i = 0; i < num; i++) { + threads[i]->join(); + delete threads[i]; + } + uint64_t end = timestamp_now(); + std::cout << "pmemkv put test: 1048576 " + << " bytes test, consumes " << (end - start) / 1000.0 + << "s, throughput is " << 20480 / ((end - start) / 1000.0) << "MB/s" + << std::endl; + + for (int i = 0; i < num; i++) { + auto vec = addresses[i]; + while (!vec.empty()) { + auto address = vec.back(); + vec.pop_back(); + clients[i]->free(address); + } + } + std::cout << "freed." 
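The remote benchmarks repeat the begin_tx()/end_tx() bracket around every client call. A small RAII guard keeps that bracket exception-safe and harder to forget; this is only a sketch and assumes nothing beyond the PmPoolClient begin_tx/end_tx pair already used in these files.

#include "pmpool/client/PmPoolClient.h"

// Hypothetical RAII wrapper around the PmPoolClient transaction bracket.
class TxGuard {
 public:
  explicit TxGuard(PmPoolClient* client) : client_(client) { client_->begin_tx(); }
  ~TxGuard() { client_->end_tx(); }
  TxGuard(const TxGuard&) = delete;
  TxGuard& operator=(const TxGuard&) = delete;

 private:
  PmPoolClient* client_;
};

// Usage inside the benchmark loop:
//   TxGuard tx(clients[i]);
//   uint64_t addr = clients[i]->alloc(1048576);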
<< std::endl; + for (int i = 0; i < num; i++) { + clients[i]->wait(); + delete clients[i]; + } + return 0; +} diff --git a/rpmp/benchmark/remote_allocate_write.cc b/rpmp/benchmark/remote_allocate_write.cc new file mode 100644 index 00000000..2a1f5da7 --- /dev/null +++ b/rpmp/benchmark/remote_allocate_write.cc @@ -0,0 +1,90 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/benchmark/allocate_perf.cc + * Path: /mnt/spark-pmof/tool/rpmp/benchmark + * Created Date: Friday, December 20th 2019, 8:29:23 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include +#include // NOLINT +#include +#include "pmpool/client/PmPoolClient.h" + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +std::atomic count = {0}; +std::mutex mtx; +char str[1048576]; +std::vector clients; +std::map> addresses; + +void func1(int i) { + while (true) { + uint64_t count_ = count++; + if (count_ < 20480) { + clients[i]->begin_tx(); + if (addresses.count(i) != 0) { + auto vec = addresses[i]; + vec.push_back(clients[i]->write(str, 1048576)); + } else { + std::vector vec; + vec.push_back(clients[i]->write(str, 1048576)); + addresses[i] = vec; + } + clients[i]->end_tx(); + } else { + break; + } + } +} + +int main() { + std::vector threads; + memset(str, '0', 1048576); + + int num = 0; + std::cout << "start write." << std::endl; + num = 0; + count = 0; + for (int i = 0; i < 4; i++) { + PmPoolClient *client = new PmPoolClient("172.168.0.40", "12346"); + client->begin_tx(); + client->init(); + client->end_tx(); + clients.push_back(client); + num++; + } + uint64_t start = timestamp_now(); + for (int i = 0; i < num; i++) { + auto t = new std::thread(func1, i); + threads.push_back(t); + } + for (int i = 0; i < num; i++) { + threads[i]->join(); + delete threads[i]; + } + uint64_t end = timestamp_now(); + std::cout << "pmemkv put test: 1048576 " + << " bytes test, consumes " << (end - start) / 1000.0 + << "s, throughput is " << 20480 / ((end - start) / 1000.0) << "MB/s" + << std::endl; + for (int i = 0; i < num; i++) { + auto vec = addresses[i]; + while (!vec.empty()) { + auto address = vec.back(); + vec.pop_back(); + clients[i]->free(address); + } + } + std::cout << "freed." 
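Note that in both remote_allocate.cc and remote_allocate_write.cc the existing-key branch takes a copy of the per-thread vector (auto vec = addresses[i]), so every address after a thread's first one is pushed into a temporary and is never freed by the cleanup loop. Taking the element by reference (operator[] default-constructs it on first use) keeps the bookkeeping. The sketch below assumes the map holds one std::vector<uint64_t> per thread index; concurrent insertions would still need the existing mtx or per-thread vectors.

#include <cstdint>
#include <map>
#include <vector>

// Record an allocated address so the cleanup loop can free it later.
void record_allocation(std::map<int, std::vector<uint64_t>>& addresses, int i,
                       uint64_t addr) {
  auto& vec = addresses[i];  // reference, not a copy of the vector
  vec.push_back(addr);
}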
<< std::endl; + for (int i = 0; i < num; i++) { + clients[i]->wait(); + delete clients[i]; + } + return 0; +} diff --git a/rpmp/benchmark/remote_read.cc b/rpmp/benchmark/remote_read.cc new file mode 100644 index 00000000..ec1c74af --- /dev/null +++ b/rpmp/benchmark/remote_read.cc @@ -0,0 +1,82 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/benchmark/allocate_perf.cc + * Path: /mnt/spark-pmof/tool/rpmp/benchmark + * Created Date: Friday, December 20th 2019, 8:29:23 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include +#include +#include // NOLINT +#include "pmpool/Event.h" +#include "pmpool/client/PmPoolClient.h" + +char str[1048576]; +char str_read[1048576]; +std::atomic count = {0}; +std::mutex mtx; +std::vector clients; +std::vector addresses; +uint64_t buffer_size = 1024*64; +uint64_t buffer_num = 1000000; +int thread_num = 1; + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +void func(uint64_t i) { + while (true) { + uint64_t count_ = count++; + if (count_ < buffer_num) { + clients[i]->begin_tx(); + clients[i]->read(addresses[i], str_read, buffer_size); + clients[i]->end_tx(); + } else { + break; + } + } +} + +int main() { + std::vector threads; + + memset(str, '0', buffer_size); + for (int i = 0; i < thread_num; i++) { + PmPoolClient *client = new PmPoolClient("172.168.0.40", "12346"); + client->init(); + client->begin_tx(); + addresses.push_back(client->write(str, buffer_size)); + client->end_tx(); + clients.push_back(client); + } + uint64_t start = timestamp_now(); + for (int i = 0; i < thread_num; i++) { + auto t = new std::thread(func, i); + threads.push_back(t); + } + for (int i = 0; i < thread_num; i++) { + threads[i]->join(); + delete threads[i]; + } + uint64_t end = timestamp_now(); + std::cout << "pmemkv put test: " << buffer_size << " " + << " bytes test, consumes " << (end - start) / 1000.0 + << "s, throughput is " + << buffer_num / 1024.0 * buffer_size / 1024.0 / + ((end - start) / 1000.0) + << "MB/s" << std::endl; + for (int i = 0; i < thread_num; i++) { + clients[i]->begin_tx(); + clients[i]->free(addresses[i]); + clients[i]->end_tx(); + } + std::cout << "finished." 
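remote_read.cc derives MB/s from buffer_num, buffer_size, and the elapsed milliseconds. Factoring that arithmetic into a helper makes the units explicit; this only restates the formula already used above.

#include <cstdint>

// (ops * bytes_per_op) bytes, converted to MiB, divided by elapsed seconds.
double throughput_mb_per_s(uint64_t ops, uint64_t bytes_per_op, uint64_t elapsed_ms) {
  double mib = static_cast<double>(ops) * bytes_per_op / (1024.0 * 1024.0);
  return mib / (elapsed_ms / 1000.0);
}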
<< std::endl; + for (int i = 0; i < thread_num; i++) { + clients[i]->wait(); + } + return 0; +} diff --git a/rpmp/benchmark/remote_write.cc b/rpmp/benchmark/remote_write.cc new file mode 100644 index 00000000..f2fb1d0f --- /dev/null +++ b/rpmp/benchmark/remote_write.cc @@ -0,0 +1,98 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/benchmark/allocate_perf.cc + * Path: /mnt/spark-pmof/tool/rpmp/benchmark + * Created Date: Friday, December 20th 2019, 8:29:23 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include +#include // NOLINT +#include "pmpool/client/PmPoolClient.h" + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +int count = 0; +std::mutex mtx; +uint64_t addresses[20480]; +char str[1048576]; + +void func(PmPoolClient* client) { + while (true) { + std::unique_lock lk(mtx); + uint64_t count_ = count++; + lk.unlock(); + if (count_ < 20480) { + client->begin_tx(); + auto addr = client->alloc(1048576); + client->end_tx(); + addresses[count_] = addr; + } else { + break; + } + } +} + +void func1(PmPoolClient* client) { + while (true) { + std::unique_lock lk(mtx); + uint64_t count_ = count++; + lk.unlock(); + if (count_ < 20480) { + client->begin_tx(); + client->write(addresses[count_], str, 1048576); + client->end_tx(); + } else { + break; + } + } +} + +int main() { + std::vector threads; + PmPoolClient client("172.168.0.40", "12346"); + memset(str, '0', 1048576); + client.init(); + + int num = 0; + for (int i = 0; i < 1; i++) { + num++; + auto t = new std::thread(func, &client); + threads.push_back(t); + } + for (int i = 0; i < num; i++) { + threads[i]->join(); + delete threads[i]; + } + std::cout << "start write." << std::endl; + num = 0; + count = 0; + std::vector threads_1; + uint64_t start = timestamp_now(); + for (int i = 0; i < 8; i++) { + num++; + auto t = new std::thread(func1, &client); + threads_1.push_back(t); + } + for (int i = 0; i < num; i++) { + threads_1[i]->join(); + delete threads_1[i]; + } + uint64_t end = timestamp_now(); + std::cout << "pmemkv put test: 1048576 " + << " bytes test, consumes " << (end - start) / 1000.0 + << "s, throughput is " << 20480 / ((end - start) / 1000.0) << "MB/s" + << std::endl; + for (int i = 0; i < 20480; i++) { + client.begin_tx(); + client.free(addresses[i]); + client.end_tx(); + } + std::cout << "freed." 
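remote_write.cc claims work items by incrementing a plain int under a mutex, while the other benchmarks already use std::atomic for the same job. If the lock only protects the counter, an atomic fetch-and-increment is the lighter-weight equivalent; the sketch below shows just the work-claiming skeleton, with the client calls elided.

#include <atomic>
#include <cstdint>

std::atomic<uint64_t> next_index{0};
constexpr uint64_t kTotalOps = 20480;

void worker_loop() {
  while (true) {
    uint64_t idx = next_index.fetch_add(1);
    if (idx >= kTotalOps) break;
    // ... perform one allocation or write for slot idx, as in the original loops ...
  }
}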
<< std::endl; + client.wait(); + return 0; +} diff --git a/rpmp/cmake/googletest-download.cmake b/rpmp/cmake/googletest-download.cmake new file mode 100644 index 00000000..313be3c8 --- /dev/null +++ b/rpmp/cmake/googletest-download.cmake @@ -0,0 +1,20 @@ +# code copied from https://crascit.com/2015/07/25/cmake-gtest/ +cmake_minimum_required(VERSION 3.5 FATAL_ERROR) + +project(googletest-download NONE) + +include(ExternalProject) + +ExternalProject_Add( + googletest + SOURCE_DIR "@GOOGLETEST_DOWNLOAD_ROOT@/googletest-src" + BINARY_DIR "@GOOGLETEST_DOWNLOAD_ROOT@/googletest-build" + GIT_REPOSITORY + https://github.com/google/googletest.git + GIT_TAG + release-1.8.1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) diff --git a/rpmp/cmake/googletest.cmake b/rpmp/cmake/googletest.cmake new file mode 100644 index 00000000..5ca70908 --- /dev/null +++ b/rpmp/cmake/googletest.cmake @@ -0,0 +1,32 @@ +# the following code to fetch googletest +# is inspired by and adapted after https://crascit.com/2015/07/25/cmake-gtest/ +# download and unpack googletest at configure time + +macro(fetch_googletest _download_module_path _download_root) + set(GOOGLETEST_DOWNLOAD_ROOT ${_download_root}) + configure_file( + ${_download_module_path}/googletest-download.cmake + ${_download_root}/CMakeLists.txt + @ONLY + ) + unset(GOOGLETEST_DOWNLOAD_ROOT) + + execute_process( + COMMAND + "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" . + WORKING_DIRECTORY + ${_download_root} + ) + execute_process( + COMMAND + "${CMAKE_COMMAND}" --build . + WORKING_DIRECTORY + ${_download_root} + ) + + # adds the targers: gtest, gtest_main, gmock, gmock_main + add_subdirectory( + ${_download_root}/googletest-src + ${_download_root}/googletest-build + ) +endmacro() diff --git a/rpmp/include/xxhash/xxhash.h b/rpmp/include/xxhash/xxhash.h new file mode 100644 index 00000000..01488ef5 --- /dev/null +++ b/rpmp/include/xxhash/xxhash.h @@ -0,0 +1,321 @@ +#pragma once +#pragma clang system_header +#pragma gcc system_header +/* +xxHash - Extremely Fast Hash algorithm +Header File +Copyright (C) 2012-2016, Yann Collet. +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+You can contact the author at : +- xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + +/* Notice extracted from xxHash homepage : +xxHash is an extremely fast Hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MumurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. +A 64-bit version, named XXH64, is available since r35. +It offers much better speed, but for 64-bit applications only. +Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +#if defined (__cplusplus) +extern "C" { +#endif + + + /* **************************** + * Definitions + ******************************/ +#include /* size_t */ + typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode; + + + /* **************************** + * API modifier + ******************************/ + /** XXH_INLINE_ALL (and XXH_PRIVATE_API) + * This is useful to include xxhash functions in `static` mode + * in order to inline them, and remove their symbol from the public list. + * Inlining can offer dramatic performance improvement on small keys. + * Methodology : + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * `xxhash.c` is automatically included. + * It's not useful to compile and link it as a separate module. + */ +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# ifndef XXH_STATIC_LINKING_ONLY +# define XXH_STATIC_LINKING_ONLY +# endif +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif +#else +# define XXH_PUBLIC_API /* do nothing */ +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + + /*! XXH_NAMESPACE, aka Namespace Emulation : + * + * If you want to include _and expose_ xxHash functions from within your own library, + * but also want to avoid symbol collisions with other libraries which may also include xxHash, + * + * you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library + * with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values). + * + * Note that no change is required within the calling program as long as it includes `xxhash.h` : + * regular symbol name will be automatically translated by this header. 
+ */ +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +#endif + + + /* ************************************* + * Version + ***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 6 +#define XXH_VERSION_RELEASE 5 +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) + XXH_PUBLIC_API unsigned XXH_versionNumber(void); + + + /*-********************************************************************** + * 32-bit hash + ************************************************************************/ + typedef unsigned int XXH32_hash_t; + + /*! XXH32() : + Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input". + The memory between input & input+length must be valid (allocated and read-accessible). + "seed" can be used to alter the result predictably. + Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */ + XXH_PUBLIC_API XXH32_hash_t XXH32(const void* input, size_t length, unsigned int seed); + + /*====== Streaming ======*/ + typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ + XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); + XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); + XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + + XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed); + XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t* statePtr, const void* input, size_t length); + XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* statePtr); + + /* + * Streaming functions generate the xxHash of an input provided in multiple segments. + * Note that, for small input, they are slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * XXH state must first be allocated, using XXH*_createState() . + * + * Start a new hash by initializing state with a seed, using XXH*_reset(). + * + * Then, feed the hash state by calling XXH*_update() as many times as necessary. 
+ * The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using XXH*_digest(). + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a digest, + * and generate some new hashes later on, by calling again XXH*_digest(). + * + * When done, free XXH state space if it was allocated dynamically. + */ + + /*====== Canonical representation ======*/ + + typedef struct { unsigned char digest[4]; } XXH32_canonical_t; + XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); + XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + /* Default result type for XXH functions are primitive unsigned 32 and 64 bits. + * The canonical representation uses human-readable write convention, aka big-endian (large digits first). + * These functions allow transformation of hash result into and from its canonical format. + * This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. + */ + + +#ifndef XXH_NO_LONG_LONG + /*-********************************************************************** + * 64-bit hash + ************************************************************************/ + typedef unsigned long long XXH64_hash_t; + + /*! XXH64() : + Calculate the 64-bit hash of sequence of length "len" stored at memory address "input". + "seed" can be used to alter the result predictably. + This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark). + */ + XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, unsigned long long seed); + + /*====== Streaming ======*/ + typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ + XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); + XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + + XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed); + XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t* statePtr, const void* input, size_t length); + XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* statePtr); + + /*====== Canonical representation ======*/ + typedef struct { unsigned char digest[8]; } XXH64_canonical_t; + XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); + XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); +#endif /* XXH_NO_LONG_LONG */ + + + +#ifdef XXH_STATIC_LINKING_ONLY + + /* ================================================================================================ + This section contains declarations which are not guaranteed to remain stable. + They may change in future versions, becoming incompatible with a different version of the library. + These declarations should only be used with static linking. + Never use them in association with dynamic linking ! + =================================================================================================== */ + + /* These definitions are only present to allow + * static allocation of XXH state, on stack or in a struct for example. + * Never **ever** use members directly. 
*/ + +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + + struct XXH32_state_s { + uint32_t total_len_32; + uint32_t large_len; + uint32_t v1; + uint32_t v2; + uint32_t v3; + uint32_t v4; + uint32_t mem32[4]; + uint32_t memsize; + uint32_t reserved; /* never read nor write, might be removed in a future version */ + }; /* typedef'd to XXH32_state_t */ + + struct XXH64_state_s { + uint64_t total_len; + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t v4; + uint64_t mem64[4]; + uint32_t memsize; + uint32_t reserved[2]; /* never read nor write, might be removed in a future version */ + }; /* typedef'd to XXH64_state_t */ + +# else + + struct XXH32_state_s { + unsigned total_len_32; + unsigned large_len; + unsigned v1; + unsigned v2; + unsigned v3; + unsigned v4; + unsigned mem32[4]; + unsigned memsize; + unsigned reserved; /* never read nor write, might be removed in a future version */ + }; /* typedef'd to XXH32_state_t */ + +# ifndef XXH_NO_LONG_LONG /* remove 64-bit support */ + struct XXH64_state_s { + unsigned long long total_len; + unsigned long long v1; + unsigned long long v2; + unsigned long long v3; + unsigned long long v4; + unsigned long long mem64[4]; + unsigned memsize; + unsigned reserved[2]; /* never read nor write, might be removed in a future version */ + }; /* typedef'd to XXH64_state_t */ +# endif + +# endif + + +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# include "xxhash.c" /* include xxhash function bodies as `static`, for inlining */ +#endif + +#endif /* XXH_STATIC_LINKING_ONLY */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* XXHASH_H_5627135585666179 */ \ No newline at end of file diff --git a/rpmp/include/xxhash/xxhash.hpp b/rpmp/include/xxhash/xxhash.hpp new file mode 100644 index 00000000..81e82074 --- /dev/null +++ b/rpmp/include/xxhash/xxhash.hpp @@ -0,0 +1,719 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +/* +xxHash - Extremely Fast Hash algorithm +Header File +Copyright (C) 2012-2018, Yann Collet. +Copyright (C) 2017-2018, Piotr Pliszka. +All rights reserved. + +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+You can contact the author at : +- xxHash source repository : https://github.com/Cyan4973/xxHash +- xxHash C++ port repository : https://github.com/RedSpah/xxhash_cpp +*/ + +/* ************************************* +* Tuning parameters +***************************************/ +/*!XXH_FORCE_MEMORY_ACCESS : +* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. +* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. +* The below switch allow to select different access method for improved performance. +* Method 0 (default) : use `memcpy()`. Safe and portable. +* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). +* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. +* Method 2 : direct access. This method doesn't depend on compiler but violate C standard. +* It can generate buggy code on targets which do not support unaligned memory accesses. +* But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) +* See http://stackoverflow.com/a/32095106/646947 for details. +* Prefer these methods in priority order (0 > 1 > 2) +*/ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif defined(__INTEL_COMPILER) || (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + + +/*!XXH_FORCE_NATIVE_FORMAT : +* By default, xxHash library provides endian-independent Hash values, based on little-endian convention. +* Results are therefore identical for little-endian and big-endian CPU. +* This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. +* Should endian-independence be of no importance for your application, you may set the #define below to 1, +* to improve speed for Big-endian CPU. +* This option has no impact on Little_Endian CPU. +*/ +#if !defined(XXH_FORCE_NATIVE_FORMAT) || (XXH_FORCE_NATIVE_FORMAT == 0) /* can be defined externally */ +# define XXH_FORCE_NATIVE_FORMAT 0 +# define XXH_CPU_LITTLE_ENDIAN 1 +#endif + + +/*!XXH_FORCE_ALIGN_CHECK : +* This is a minor performance trick, only useful with lots of very small keys. +* It means : check for aligned/unaligned input. +* The check costs one initial branch per hash; +* set it to 0 when the input is guaranteed to be aligned, +* or when alignment doesn't matter for performance. +*/ +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +/*!XXH_CPU_LITTLE_ENDIAN : +* This is a CPU endian detection macro, will be +* automatically set to 1 (little endian) if XXH_FORCE_NATIVE_FORMAT +* is left undefined, XXH_FORCE_NATIVE_FORMAT is defined to 0, or if an x86/x86_64 compiler macro is defined. +* If left undefined, endianness will be determined at runtime, at the cost of a slight one-time overhead +* and a larger overhead due to get_endian() not being constexpr. 
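The XXH_FORCE_MEMORY_ACCESS comment above lists three access strategies; method 0, the default, reads potentially unaligned words through memcpy. In isolation (this is not the library's exact template) the idea looks like this:

#include <cstdint>
#include <cstring>

// Portable unaligned 32-bit load, the strategy behind XXH_FORCE_MEMORY_ACCESS == 0.
inline uint32_t load_u32(const void* p) {
  uint32_t v;
  std::memcpy(&v, p, sizeof(v));  // compilers typically lower this to a single load
  return v;
}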
+*/ +#ifndef XXH_CPU_LITTLE_ENDIAN +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_CPU_LITTLE_ENDIAN 1 +# endif +#endif + +/* ************************************* +* Compiler Specific Options +***************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +namespace xxh +{ + /* ************************************* + * Version + ***************************************/ + constexpr int cpp_version_major = 0; + constexpr int cpp_version_minor = 6; + constexpr int cpp_version_release = 5; + constexpr uint32_t version_number() { return cpp_version_major * 10000 + cpp_version_minor * 100 + cpp_version_release; } + + namespace hash_t_impl + { + /* ************************************* + * Basic Types - Detail + ***************************************/ + + using _hash32_underlying = uint32_t; + using _hash64_underlying = uint64_t; + + template + struct hash_type { using type = void; }; + template <> + struct hash_type<32> { using type = _hash32_underlying; }; + template <> + struct hash_type<64> { using type = _hash64_underlying; }; + } + + /* ************************************* + * Basic Types - Public + ***************************************/ + + template + using hash_t = typename hash_t_impl::hash_type::type; + using hash32_t = hash_t<32>; + using hash64_t = hash_t<64>; + + /* ************************************* + * Bit Functions - Public + ***************************************/ + + namespace bit_ops + { + /* **************************************** + * Intrinsics and Bit Operations + ******************************************/ + +#if defined(_MSC_VER) + inline uint32_t rotl32(uint32_t x, int32_t r) { return _rotl(x, r); } + inline uint64_t rotl64(uint64_t x, int32_t r) { return _rotl64(x, r); } +#else + inline uint32_t rotl32(uint32_t x, int32_t r) { return ((x << r) | (x >> (32 - r))); } + inline uint64_t rotl64(uint64_t x, int32_t r) { return ((x << r) | (x >> (64 - r))); } +#endif + +#if defined(_MSC_VER) /* Visual Studio */ + inline uint32_t swap32(uint32_t x) { return _byteswap_ulong(x); } + inline uint64_t swap64(uint64_t x) { return _byteswap_uint64(x); } +#elif XXH_GCC_VERSION >= 403 + inline uint32_t swap32(uint32_t x) { return __builtin_bswap32(x); } + inline uint64_t swap64(uint64_t x) { return __builtin_bswap64(x); } +#else + inline uint32_t swap32(uint32_t x) { return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) | ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff); } + inline uint64_t swap64(uint64_t x) { return ((x << 56) & 0xff00000000000000ULL) | ((x << 40) & 0x00ff000000000000ULL) | ((x << 24) & 0x0000ff0000000000ULL) | ((x << 8) & 0x000000ff00000000ULL) | ((x >> 8) & 0x00000000ff000000ULL) | ((x >> 24) & 0x0000000000ff0000ULL) | ((x >> 40) & 0x000000000000ff00ULL) | ((x >> 56) & 0x00000000000000ffULL); } +#endif + template + inline hash_t rotl(hash_t n, int32_t r) {}; + + template <> + inline hash_t<32> rotl<32>(hash_t<32> n, int32_t r) + { + return rotl32(n, r); + }; + + template <> + inline hash_t<64> rotl<64>(hash_t<64> n, int32_t r) + { + return rotl64(n, r); + }; + + template + inline hash_t swap(hash_t n) {}; + + template <> + inline hash_t<32> swap<32>(hash_t<32> n) + { + return swap32(n); + }; + + template <> + inline hash_t<64> swap<64>(hash_t<64> n) + { + return swap64(n); + }; + } + + /* ************************************* + * Memory Functions - Public + ***************************************/ + + enum class alignment : uint8_t { aligned, unaligned }; + enum 
class endianness : uint8_t { big_endian = 0, little_endian = 1, unspecified = 2 }; + + namespace mem_ops + { + /* ************************************* + * Memory Access + ***************************************/ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + + /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ + template + inline hash_t read_unaligned(const void* memPtr) { return *(const hash_t*)memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + + /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ + /* currently only defined for gcc and icc */ + template + using unalign = union { hash_t uval; } __attribute((packed)); + + template + inline hash_t read_unaligned(const void* memPtr) { return ((const unalign*)memPtr)->uval; } +#else + + /* portable and safe solution. Generally efficient. + * see : http://stackoverflow.com/a/32095106/646947 + */ + template + inline hash_t read_unaligned(const void* memPtr) + { + hash_t val; + memcpy(&val, memPtr, sizeof(val)); + return val; + } + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + inline hash_t<32> read32(const void* memPtr) { return read_unaligned<32>(memPtr); } + inline hash_t<64> read64(const void* memPtr) { return read_unaligned<64>(memPtr); } + + /* ************************************* + * Architecture Macros + ***************************************/ + + /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ + +#ifndef XXH_CPU_LITTLE_ENDIAN + + inline endianness get_endian(endianness endian) + { + static struct _dummy_t + { + std::array endian_lookup = { endianness::big_endian, endianness::little_endian, endianness::unspecified }; + const int g_one = 1; + _dummy_t() + { + endian_lookup[2] = static_cast(*(const char*)(&g_one)); + } + } _dummy; + + return _dummy.endian_lookup[(uint8_t)endian]; + } + + inline bool is_little_endian() + { + return get_endian(endianness::unspecified) == endianness::little_endian; + } + +#else + constexpr endianness get_endian(endianness endian) + { + constexpr std::array endian_lookup = { endianness::big_endian, endianness::little_endian, (XXH_CPU_LITTLE_ENDIAN) ? endianness::little_endian : endianness::big_endian }; + return endian_lookup[static_cast(endian)]; + } + + constexpr bool is_little_endian() + { + return get_endian(endianness::unspecified) == endianness::little_endian; + } + +#endif + + + + /* *************************** + * Memory reads + *****************************/ + + + template + inline hash_t readLE_align(const void* ptr, endianness endian, alignment align) + { + if (align == alignment::unaligned) + { + return endian == endianness::little_endian ? read_unaligned(ptr) : bit_ops::swap(read_unaligned(ptr)); + } + else + { + return endian == endianness::little_endian ? *reinterpret_cast*>(ptr) : bit_ops::swap(*reinterpret_cast*>(ptr)); + } + } + + template + inline hash_t readLE(const void* ptr, endianness endian) + { + return readLE_align(ptr, endian, alignment::unaligned); + } + + template + inline hash_t readBE(const void* ptr) + { + return is_little_endian() ? bit_ops::swap(read_unaligned(ptr)) : read_unaligned(ptr); + } + + template + inline alignment get_alignment(const void* input) + { + return ((XXH_FORCE_ALIGN_CHECK) && ((reinterpret_cast(input) & ((N / 8) - 1)) == 0)) ? 
xxh::alignment::aligned : xxh::alignment::unaligned; + } + } + + /* ******************************************************************* + * Hash functions + *********************************************************************/ + + namespace detail + { + /* ******************************************************************* + * Hash functions - Implementation + *********************************************************************/ + + constexpr static std::array primes32 = { 2654435761U, 2246822519U, 3266489917U, 668265263U, 374761393U }; + constexpr static std::array primes64 = { 11400714785074694791ULL, 14029467366897019727ULL, 1609587929392839161ULL, 9650029242287828579ULL, 2870177450012600261ULL }; + + template + constexpr hash_t PRIME(int32_t n) {}; + + template <> + constexpr hash32_t PRIME<32>(int32_t n) + { + return primes32[n - 1]; + } + + template <> + constexpr hash64_t PRIME<64>(int32_t n) + { + return primes64[n - 1]; + } + + template + inline hash_t round(hash_t seed, hash_t input) + { + seed += input * PRIME(2); + seed = bit_ops::rotl(seed, ((N == 32) ? 13 : 31)); + seed *= PRIME(1); + return seed; + } + + inline hash64_t mergeRound64(hash64_t acc, hash64_t val) + { + val = round<64>(0, val); + acc ^= val; + acc = acc * PRIME<64>(1) + PRIME<64>(4); + return acc; + } + + template + inline void endian_align_sub_mergeround([[maybe_unused]] hash_t& hash_ret, hash_t v1, hash_t v2, hash_t v3, hash_t v4) {}; + + template <> + inline void endian_align_sub_mergeround<64>(hash_t<64>& hash_ret, hash_t<64> v1, hash_t<64> v2, hash_t<64> v3, hash_t<64> v4) + { + hash_ret = mergeRound64(hash_ret, v1); + hash_ret = mergeRound64(hash_ret, v2); + hash_ret = mergeRound64(hash_ret, v3); + hash_ret = mergeRound64(hash_ret, v4); + } + + template + inline hash_t endian_align_sub_ending(hash_t hash_ret, const uint8_t* p, const uint8_t* bEnd, xxh::endianness endian, xxh::alignment align) {}; + + template <> + inline hash_t<32> endian_align_sub_ending<32>(hash_t<32> hash_ret, const uint8_t* p, const uint8_t* bEnd, xxh::endianness endian, xxh::alignment align) + { + while ((p + 4) <= bEnd) + { + hash_ret += mem_ops::readLE_align<32>(p, endian, align) * PRIME<32>(3); + hash_ret = bit_ops::rotl<32>(hash_ret, 17) * PRIME<32>(4); + p += 4; + } + + while (p < bEnd) + { + hash_ret += (*p) * PRIME<32>(5); + hash_ret = bit_ops::rotl<32>(hash_ret, 11) * PRIME<32>(1); + p++; + } + + hash_ret ^= hash_ret >> 15; + hash_ret *= PRIME<32>(2); + hash_ret ^= hash_ret >> 13; + hash_ret *= PRIME<32>(3); + hash_ret ^= hash_ret >> 16; + + return hash_ret; + } + + template <> + inline hash_t<64> endian_align_sub_ending<64>(hash_t<64> hash_ret, const uint8_t* p, const uint8_t* bEnd, xxh::endianness endian, xxh::alignment align) + { + while (p + 8 <= bEnd) + { + const hash64_t k1 = round<64>(0, mem_ops::readLE_align<64>(p, endian, align)); + hash_ret ^= k1; + hash_ret = bit_ops::rotl<64>(hash_ret, 27) * PRIME<64>(1) + PRIME<64>(4); + p += 8; + } + + if (p + 4 <= bEnd) + { + hash_ret ^= static_cast(mem_ops::readLE_align<32>(p, endian, align)) * PRIME<64>(1); + hash_ret = bit_ops::rotl<64>(hash_ret, 23) * PRIME<64>(2) + PRIME<64>(3); + p += 4; + } + + while (p < bEnd) + { + hash_ret ^= (*p) * PRIME<64>(5); + hash_ret = bit_ops::rotl<64>(hash_ret, 11) * PRIME<64>(1); + p++; + } + + hash_ret ^= hash_ret >> 33; + hash_ret *= PRIME<64>(2); + hash_ret ^= hash_ret >> 29; + hash_ret *= PRIME<64>(3); + hash_ret ^= hash_ret >> 32; + + return hash_ret; + } + + template + inline hash_t endian_align(const void* input, size_t len, hash_t 
seed, xxh::endianness endian, xxh::alignment align) + { + static_assert(!(N != 32 && N != 64), "You can only call endian_align in 32 or 64 bit mode."); + + const uint8_t* p = static_cast(input); + const uint8_t* bEnd = p + len; + hash_t hash_ret; + + if (len >= (N / 2)) + { + const uint8_t* const limit = bEnd - (N / 2); + hash_t v1 = seed + PRIME(1) + PRIME(2); + hash_t v2 = seed + PRIME(2); + hash_t v3 = seed + 0; + hash_t v4 = seed - PRIME(1); + + do + { + v1 = round(v1, mem_ops::readLE_align(p, endian, align)); p += (N / 8); + v2 = round(v2, mem_ops::readLE_align(p, endian, align)); p += (N / 8); + v3 = round(v3, mem_ops::readLE_align(p, endian, align)); p += (N / 8); + v4 = round(v4, mem_ops::readLE_align(p, endian, align)); p += (N / 8); + } while (p <= limit); + + hash_ret = bit_ops::rotl(v1, 1) + bit_ops::rotl(v2, 7) + bit_ops::rotl(v3, 12) + bit_ops::rotl(v4, 18); + + endian_align_sub_mergeround(hash_ret, v1, v2, v3, v4); + } + else { hash_ret = seed + PRIME(5); } + + hash_ret += static_cast>(len); + + return endian_align_sub_ending(hash_ret, p, bEnd, endian, align); + } + } + + template + hash_t xxhash(const void* input, size_t len, hash_t seed = 0, endianness endian = endianness::unspecified) + { + static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode."); + return detail::endian_align(input, len, seed, mem_ops::get_endian(endian), mem_ops::get_alignment(input)); + } + + template + hash_t xxhash(const std::basic_string& input, hash_t seed = 0, endianness endian = endianness::unspecified) + { + static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode."); + return detail::endian_align(static_cast(input.data()), input.length() * sizeof(T), seed, mem_ops::get_endian(endian), mem_ops::get_alignment(static_cast(input.data()))); + } + + template + hash_t xxhash(ContiguousIterator begin, ContiguousIterator end, hash_t seed = 0, endianness endian = endianness::unspecified) + { + static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode."); + using T = typename std::decay_t; + return detail::endian_align(static_cast(&*begin), (end - begin) * sizeof(T), seed, mem_ops::get_endian(endian), mem_ops::get_alignment(static_cast(&*begin))); + } + + template + hash_t xxhash(const std::vector& input, hash_t seed = 0, endianness endian = endianness::unspecified) + { + static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode."); + return detail::endian_align(static_cast(input.data()), input.size() * sizeof(T), seed, mem_ops::get_endian(endian), mem_ops::get_alignment(static_cast(input.data()))); + } + + template + hash_t xxhash(const std::array& input, hash_t seed = 0, endianness endian = endianness::unspecified) + { + static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode."); + return detail::endian_align(static_cast(input.data()), AN * sizeof(T), seed, mem_ops::get_endian(endian), mem_ops::get_alignment(static_cast(input.data()))); + } + + template + hash_t xxhash(const std::initializer_list& input, hash_t seed = 0, endianness endian = endianness::unspecified) + { + static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode."); + return detail::endian_align(static_cast(input.begin()), input.size() * sizeof(T), seed, mem_ops::get_endian(endian), mem_ops::get_alignment(static_cast(input.begin()))); + } + + + /* ******************************************************************* + * Hash streaming + 
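For reference, the one-shot overloads above can be exercised directly. This is only a usage sketch; it assumes nothing beyond the xxh::xxhash<N>() templates defined in this header and the include path set up by the top-level CMakeLists.txt.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include "xxhash/xxhash.hpp"

int main() {
  std::string key = "shuffle_0_1_2";                      // arbitrary example key
  xxh::hash64_t h1 = xxh::xxhash<64>(key);                // hash a std::basic_string
  std::vector<uint8_t> blob(4096, 0);
  xxh::hash32_t h2 = xxh::xxhash<32>(blob, /*seed=*/42);  // hash a std::vector with a seed
  std::cout << std::hex << h1 << " " << h2 << std::endl;
  return 0;
}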
*********************************************************************/ + enum class error_code : uint8_t { ok = 0, error }; + + template + class hash_state_t { + + uint64_t total_len = 0; + hash_t v1 = 0, v2 = 0, v3 = 0, v4 = 0; + std::array, 4> mem = {{ 0,0,0,0 }}; + uint32_t memsize = 0; + + inline error_code _update_impl(const void* input, size_t length, endianness endian) + { + const uint8_t* p = reinterpret_cast(input); + const uint8_t* const bEnd = p + length; + + if (!input) { return xxh::error_code::error; } + + total_len += length; + + if (memsize + length < (N / 2)) + { /* fill in tmp buffer */ + memcpy(reinterpret_cast(mem.data()) + memsize, input, length); + memsize += static_cast(length); + return error_code::ok; + } + + if (memsize) + { /* some data left from previous update */ + memcpy(reinterpret_cast(mem.data()) + memsize, input, (N / 2) - memsize); + + const hash_t* ptr = mem.data(); + v1 = detail::round(v1, mem_ops::readLE(ptr, endian)); ptr++; + v2 = detail::round(v2, mem_ops::readLE(ptr, endian)); ptr++; + v3 = detail::round(v3, mem_ops::readLE(ptr, endian)); ptr++; + v4 = detail::round(v4, mem_ops::readLE(ptr, endian)); + + p += (N / 2) - memsize; + memsize = 0; + } + + if (p <= bEnd - (N / 2)) + { + const uint8_t* const limit = bEnd - (N / 2); + + do + { + v1 = detail::round(v1, mem_ops::readLE(p, endian)); p += (N / 8); + v2 = detail::round(v2, mem_ops::readLE(p, endian)); p += (N / 8); + v3 = detail::round(v3, mem_ops::readLE(p, endian)); p += (N / 8); + v4 = detail::round(v4, mem_ops::readLE(p, endian)); p += (N / 8); + } while (p <= limit); + } + + if (p < bEnd) + { + memcpy(mem.data(), p, static_cast(bEnd - p)); + memsize = static_cast(bEnd - p); + } + + return error_code::ok; + } + + inline hash_t _digest_impl(endianness endian) const + { + const uint8_t* p = reinterpret_cast(mem.data()); + const uint8_t* const bEnd = reinterpret_cast(mem.data()) + memsize; + hash_t hash_ret; + + if (total_len > (N / 2)) + { + hash_ret = bit_ops::rotl(v1, 1) + bit_ops::rotl(v2, 7) + bit_ops::rotl(v3, 12) + bit_ops::rotl(v4, 18); + + detail::endian_align_sub_mergeround(hash_ret, v1, v2, v3, v4); + } + else { hash_ret = v3 + detail::PRIME(5); } + + hash_ret += static_cast>(total_len); + + return detail::endian_align_sub_ending(hash_ret, p, bEnd, endian, alignment::unaligned); + } + + public: + hash_state_t(hash_t seed = 0) + { + static_assert(!(N != 32 && N != 64), "You can only stream hashing in 32 or 64 bit mode."); + v1 = seed + detail::PRIME(1) + detail::PRIME(2); + v2 = seed + detail::PRIME(2); + v3 = seed + 0; + v4 = seed - detail::PRIME(1); + }; + + hash_state_t operator=(hash_state_t& other) + { + memcpy(this, other, sizeof(hash_state_t)); + } + + error_code reset(hash_t seed = 0) + { + memset(this, 0, sizeof(hash_state_t)); + v1 = seed + detail::PRIME(1) + detail::PRIME(2); + v2 = seed + detail::PRIME(2); + v3 = seed + 0; + v4 = seed - detail::PRIME(1); + return error_code::ok; + } + + error_code update(const void* input, size_t length, endianness endian = endianness::unspecified) + { + return _update_impl(input, length, mem_ops::get_endian(endian)); + } + + template + error_code update(const std::basic_string& input, endianness endian = endianness::unspecified) + { + return _update_impl(static_cast(input.data()), input.length() * sizeof(T), mem_ops::get_endian(endian)); + } + + template + error_code update(ContiguousIterator begin, ContiguousIterator end, endianness endian = endianness::unspecified) + { + using T = typename std::decay_t; + return 
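A streaming counterpart to the one-shot call, using the hash_state_t wrapper defined in this section. Again a usage sketch only; nothing beyond the update() and digest() members of this header is assumed.

#include <cstdint>
#include <iostream>
#include <vector>

#include "xxhash/xxhash.hpp"

int main() {
  xxh::hash_state64_t state(/*seed=*/0);
  std::vector<uint8_t> chunk(1024, 0xAB);
  for (int i = 0; i < 8; i++) {
    state.update(chunk);  // feed the input in 1 KiB pieces
  }
  std::cout << std::hex << state.digest() << std::endl;
  return 0;
}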
_update_impl(static_cast(&*begin), (end - begin) * sizeof(T), mem_ops::get_endian(endian)); + } + + template + error_code update(const std::vector& input, endianness endian = endianness::unspecified) + { + return _update_impl(static_cast(input.data()), input.size() * sizeof(T), mem_ops::get_endian(endian)); + } + + template + error_code update(const std::array& input, endianness endian = endianness::unspecified) + { + return _update_impl(static_cast(input.data()), AN * sizeof(T), mem_ops::get_endian(endian)); + } + + template + error_code update(const std::initializer_list& input, endianness endian = endianness::unspecified) + { + return _update_impl(static_cast(input.begin()), input.size() * sizeof(T), mem_ops::get_endian(endian)); + } + + hash_t digest(endianness endian = endianness::unspecified) + { + return _digest_impl(mem_ops::get_endian(endian)); + } + }; + + using hash_state32_t = hash_state_t<32>; + using hash_state64_t = hash_state_t<64>; + + + /* ******************************************************************* + * Canonical + *********************************************************************/ + + template + struct canonical_t + { + std::array digest;\ + + + + canonical_t(hash_t hash) + { + if (mem_ops::is_little_endian()) { hash = bit_ops::swap(hash); } + memcpy(digest.data(), &hash, sizeof(canonical_t)); + } + + hash_t get_hash() const + { + return mem_ops::readBE(&digest); + } + }; + + using canonical32_t = canonical_t<32>; + using canonical64_t = canonical_t<64>; +} diff --git a/rpmp/main.cc b/rpmp/main.cc new file mode 100644 index 00000000..c8fe9d16 --- /dev/null +++ b/rpmp/main.cc @@ -0,0 +1,42 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/main.cc + * Path: /mnt/spark-pmof/tool/rpmp + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include + +#include "pmpool/Config.h" +#include "pmpool/DataServer.h" +#include "pmpool/Base.h" +#include "pmpool/Log.h" + +/** + * @brief program entry of RPMP server + * @param argc + * @param argv + * @return int + */ +int ServerMain(int argc, char **argv) { + /// initialize Config class + std::shared_ptr config = std::make_shared(); + CHK_ERR("config init", config->init(argc, argv)); + /// initialize Log class + std::shared_ptr log = std::make_shared(config.get()); + /// initialize DataServer class + std::shared_ptr dataServer = + std::make_shared(config.get(), log.get()); + log->get_file_log()->info("start to initialize data server."); + CHK_ERR("data server init", dataServer->init()); + log->get_file_log()->info("data server initailized."); + dataServer->wait(); + return 0; +} + +int main(int argc, char **argv) { + ServerMain(argc, argv); + return 0; +} diff --git a/rpmp/pmpool/Allocator.h b/rpmp/pmpool/Allocator.h new file mode 100644 index 00000000..95cbe228 --- /dev/null +++ b/rpmp/pmpool/Allocator.h @@ -0,0 +1,51 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Allocator.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Monday, December 9th 2019, 9:06:37 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_ALLOCATOR_H_ +#define PMPOOL_ALLOCATOR_H_ + +#include + +#include + +class Chunk; + +using std::string; + +typedef uint64_t ptr_t; + +#define TO_GLOB(addr, base, wid) \ + ((ptr_t)(addr) - (ptr_t)(base) + ((ptr_t)(wid) << 48)) +#define GET_WID(global_address) ((ptr_t)(global_address) >> 48) + +struct Addr { + uint32_t aid; + uint64_t offset; + uint64_t size; +}; + +struct DiskInfo { + DiskInfo(string& path_, uint64_t 
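The TO_GLOB/GET_WID macros in Allocator.h pack the allocator (worker) id into the top 16 bits of a 64-bit global address and keep the pool offset in the low 48 bits. A small standalone illustration of that encoding, with a made-up base address and worker id:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t base = 0x7f0000000000ULL;   // assumed pool base address
  uint64_t local = base + 4096;        // an address inside that pool
  uint64_t wid = 3;                    // allocator index
  uint64_t global = (local - base) + (wid << 48);   // what TO_GLOB(local, base, wid) produces
  assert((global >> 48) == wid);                    // GET_WID recovers the allocator index
  assert((global & ((1ULL << 48) - 1)) == 4096);    // low 48 bits keep the offset
  return 0;
}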
size_) : path(path_), size(size_) {} + string path; + uint64_t size; +}; + +class Allocator { + public: + virtual int init() = 0; + virtual uint64_t allocate_and_write(uint64_t buffer_size, + const char* content = nullptr) = 0; + virtual int write(uint64_t address, const char* content, uint64_t size) = 0; + virtual int release(uint64_t address) = 0; + virtual int release_all() = 0; + virtual int dump_all() = 0; + virtual uint64_t get_virtual_address(uint64_t address) = 0; + virtual Chunk* get_rma_chunk() = 0; +}; +#endif // PMPOOL_ALLOCATOR_H_ diff --git a/rpmp/pmpool/AllocatorProxy.h b/rpmp/pmpool/AllocatorProxy.h new file mode 100644 index 00000000..cddc5830 --- /dev/null +++ b/rpmp/pmpool/AllocatorProxy.h @@ -0,0 +1,152 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/AllocatorProxy.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Tuesday, December 10th 2019, 12:53:48 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_ALLOCATORPROXY_H_ +#define PMPOOL_ALLOCATORPROXY_H_ + +#include +#include +#include +#include +#include + +#include "Allocator.h" +#include "Config.h" +#include "DataServer.h" +#include "Log.h" +#include "PmemAllocator.h" +#include "Base.h" + +using std::atomic; +using std::make_shared; +using std::unordered_map; +using std::string; +using std::vector; + +/** + * @brief Allocator proxy schedules fairly to guarantee events are assigned to + * different allocators. + * + */ +class AllocatorProxy { + public: + AllocatorProxy() = delete; + AllocatorProxy(Config *config, Log *log, NetworkServer *networkServer) + : config_(config), log_(log) { + vector paths = config_->get_pool_paths(); + vector sizes = config_->get_pool_sizes(); + assert(paths.size() == sizes.size()); + for (int i = 0; i < paths.size(); i++) { + DiskInfo *diskInfo = new DiskInfo(paths[i], sizes[i]); + diskInfos_.push_back(diskInfo); + allocators_.push_back( + new PmemObjAllocator(log_, diskInfo, networkServer, i)); + } + } + + ~AllocatorProxy() { + for (int i = 0; i < config_->get_pool_paths().size(); i++) { + delete allocators_[i]; + delete diskInfos_[i]; + } + allocators_.clear(); + diskInfos_.clear(); + } + + int init() { + for (int i = 0; i < diskInfos_.size(); i++) { + allocators_[i]->init(); + } + return 0; + } + + uint64_t allocate_and_write(uint64_t size, const char *content = nullptr, + int index = -1) { + uint64_t addr = 0; + if (index < 0) { + int random_index = buffer_id_++ % diskInfos_.size(); + addr = allocators_[random_index]->allocate_and_write(size, content); + } else { + addr = allocators_[index % diskInfos_.size()]->allocate_and_write( + size, content); + } + return addr; + } + + int write(uint64_t address, const char *content, uint64_t size) { + uint32_t wid = GET_WID(address); + return allocators_[wid]->write(address, content, size); + } + + int release(uint64_t address) { + uint32_t wid = GET_WID(address); + return allocators_[wid]->release(address); + } + + int release_all() { + for (int i = 0; i < diskInfos_.size(); i++) { + allocators_[i]->release_all(); + } + return 0; + } + + int dump_all() { + for (int i = 0; i < diskInfos_.size(); i++) { + allocators_[i]->dump_all(); + } + return 0; + } + + uint64_t get_virtual_address(uint64_t address) { + uint32_t wid = GET_WID(address); + return allocators_[wid]->get_virtual_address(address); + } + + Chunk *get_rma_chunk(uint64_t address) { + uint32_t wid = GET_WID(address); + return allocators_[wid]->get_rma_chunk(); + } + + void cache_chunk(uint64_t key, uint64_t address, uint64_t size) { + block_meta bm = {address,
size}; + cache_chunk(key, bm); + } + + void cache_chunk(uint64_t key, block_meta bm) { + if (kv_meta_map.count(key)) { + kv_meta_map[key].push_back(bm); + } else { + vector bml; + bml.push_back(bm); + kv_meta_map[key] = bml; + } + } + + vector get_cached_chunk(uint64_t key) { + if (kv_meta_map.count(key)) { + return kv_meta_map[key]; + } + return vector(); + } + + void del_chunk(uint64_t key) { + if (kv_meta_map.count(key)) { + kv_meta_map.erase(key); + } + } + + private: + Config *config_; + Log *log_; + vector allocators_; + vector diskInfos_; + atomic buffer_id_{0}; + unordered_map> kv_meta_map; +}; + +#endif // PMPOOL_ALLOCATORPROXY_H_ diff --git a/rpmp/pmpool/Base.h b/rpmp/pmpool/Base.h new file mode 100644 index 00000000..685c0d60 --- /dev/null +++ b/rpmp/pmpool/Base.h @@ -0,0 +1,54 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/fb/Encoder.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool/fb + * Created Date: Friday, December 27th 2019, 3:05:51 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_BASE_H_ +#define PMPOOL_BASE_H_ + +#include +#include +#include +#include +#include + +#define CHK_ERR(function_name, result) \ + { \ + if (result) { \ + fprintf(stderr, "%s: %s\n", function_name, strerror(result)); \ + return result; \ + } \ + } + +struct RequestMsg { + uint32_t type; + uint64_t rid; + uint64_t address; + uint64_t src_address; + uint64_t src_rkey; + uint64_t size; + uint64_t key; +}; + +struct RequestReplyMsg { + uint32_t type; + uint32_t success; + uint64_t rid; + uint64_t address; + uint64_t size; + uint64_t key; +}; + +struct block_meta { + block_meta() : block_meta(0, 0) {} + block_meta(uint64_t _address, uint64_t _size) + : address(_address), size(_size) {} + uint64_t address; + uint64_t size; +}; + +#endif // PMPOOL_BASE_H_ diff --git a/rpmp/pmpool/CMakeLists.txt b/rpmp/pmpool/CMakeLists.txt new file mode 100644 index 00000000..64c82108 --- /dev/null +++ b/rpmp/pmpool/CMakeLists.txt @@ -0,0 +1,18 @@ +add_library(pmpool SHARED DataServer.cc Protocol.cc Event.cc NetworkServer.cc hash/xxhash.cc client/PmPoolClient.cc client/NetworkClient.cc client/native/com_intel_rpmp_PmPoolClient.cc) +target_link_libraries(pmpool LINK_PUBLIC ${Boost_LIBRARIES} hpnl pmemobj) +set_target_properties(pmpool PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") + +if(UNIX AND NOT APPLE) + set(LINUX TRUE) +endif() + +if(APPLE) + set(JNI_INCLUDE "$ENV{JAVA_HOME}/include" "$ENV{JAVA_HOME}/include/darwin") +endif() +if(LINUX) + set(JNI_INCLUDE "$ENV{JAVA_HOME}/include" "$ENV{JAVA_HOME}/include/linux") +endif() +include_directories(${JNI_INCLUDE}) + +set(CMAKE_INSTALL_PREFIX "/usr/local") +install(TARGETS pmpool LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) diff --git a/rpmp/pmpool/Common.h b/rpmp/pmpool/Common.h new file mode 100644 index 00000000..56c6fa0b --- /dev/null +++ b/rpmp/pmpool/Common.h @@ -0,0 +1,28 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Common.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Wednesday, January 15th 2020, 7:44:44 pm + * Author: root + * + * Copyright (c) 2020 Your Company + */ + +#ifndef PMPOOL_COMMON_H_ +#define PMPOOL_COMMON_H_ + +#include + +class spin_mutex { + public: + std::atomic_flag flag = ATOMIC_FLAG_INIT; + spin_mutex() = default; + spin_mutex(const spin_mutex &) = delete; + spin_mutex &operator=(const spin_mutex &) = delete; + void lock() { + while (flag.test_and_set(std::memory_order_acquire)) { + } + } + void unlock() { flag.clear(std::memory_order_release); } +}; + +#endif // 
PMPOOL_COMMON_H_ diff --git a/rpmp/pmpool/Config.h b/rpmp/pmpool/Config.h new file mode 100644 index 00000000..2a1b9526 --- /dev/null +++ b/rpmp/pmpool/Config.h @@ -0,0 +1,139 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Config.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_CONFIG_H_ +#define PMPOOL_CONFIG_H_ + +#include +#include +#include +#include + +#include + +using boost::program_options::error; +using boost::program_options::options_description; +using boost::program_options::value; +using boost::program_options::variables_map; +using std::string; +using std::vector; + +/** + * @brief This class represents the current RPMP configuration. + * + */ +class Config { + public: + int init(int argc, char **argv) { + try { + options_description desc{"Options"}; + desc.add_options()("help,h", "Help screen")( + "address,a", value()->default_value("172.168.0.40"), + "set the rdma server address")( + "port,p", value()->default_value("12346"), + "set the rdma server port")("network_buffer_size,nbs", + value()->default_value(65536), + "set network buffer size")( + "network_buffer_num,nbn", value()->default_value(16), + "set network buffer number")("network_worker,nw", + value()->default_value(1), + "set network wroker number")( + "paths,ps", value>(), "set memory pool path")( + "sizes,ss", value>(), "set memory pool size")( + "log,l", value()->default_value("/tmp/rpmp.log"), + "set rpmp log file path")("log_level,ll", + value()->default_value("warn"), + "set log level"); + + variables_map vm; + store(parse_command_line(argc, argv, desc), vm); + notify(vm); + + if (vm.count("help")) { + std::cout << desc << '\n'; + return -1; + } + set_ip(vm["address"].as()); + set_port(vm["port"].as()); + set_network_buffer_size(vm["network_buffer_size"].as()); + set_network_buffer_num(vm["network_buffer_num"].as()); + set_network_worker_num(vm["network_worker"].as()); + pool_paths_.push_back("/dev/dax0.0"); + pool_paths_.push_back("/dev/dax0.1"); + pool_paths_.push_back("/dev/dax1.0"); + pool_paths_.push_back("/dev/dax1.1"); + sizes_.push_back(126833655808L); + sizes_.push_back(126833655808L); + sizes_.push_back(126833655808L); + sizes_.push_back(126833655808L); + affinities_.push_back(2); + affinities_.push_back(41); + affinities_.push_back(22); + affinities_.push_back(60); + set_log_path(vm["log"].as()); + set_log_level(vm["log_level"].as()); + } catch (const error &ex) { + std::cerr << ex.what() << '\n'; + } + return 0; + } + + string get_ip() { return ip_; } + void set_ip(string ip) { ip_ = ip; } + + string get_port() { return port_; } + void set_port(string port) { port_ = port; } + + int get_network_buffer_size() { return network_buffer_size_; } + void set_network_buffer_size(int network_buffer_size) { + network_buffer_size_ = network_buffer_size; + } + + int get_network_buffer_num() { return network_buffer_num_; } + void set_network_buffer_num(int network_buffer_num) { + network_buffer_num_ = network_buffer_num; + } + + int get_network_worker_num() { return network_worker_num_; } + void set_network_worker_num(int network_worker_num) { + network_worker_num_ = network_worker_num; + } + + vector &get_pool_paths() { return pool_paths_; } + void set_pool_paths(const vector &pool_paths) { + pool_paths_ = pool_paths; + } + + std::vector get_pool_sizes() { return sizes_; } + void set_pool_sizes(vector sizes) { sizes_ = sizes; } + + int get_pool_size() { return sizes_.size(); } 
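  // Config::init() above follows the standard boost::program_options flow
  // (options_description -> variables_map). A minimal, self-contained sketch
  // of that flow, reusing two of the option names and defaults from init();
  // illustrative only, not part of RPMP:
  //
  //   #include <iostream>
  //   #include <string>
  //   #include <boost/program_options.hpp>
  //
  //   namespace po = boost::program_options;
  //
  //   int main(int argc, char **argv) {
  //     po::options_description desc{"Options"};
  //     desc.add_options()
  //         ("help,h", "Help screen")
  //         ("address,a", po::value<std::string>()->default_value("172.168.0.40"),
  //          "set the rdma server address")
  //         ("port,p", po::value<std::string>()->default_value("12346"),
  //          "set the rdma server port");
  //     po::variables_map vm;
  //     po::store(po::parse_command_line(argc, argv, desc), vm);
  //     po::notify(vm);
  //     if (vm.count("help")) { std::cout << desc << '\n'; return 0; }
  //     std::cout << vm["address"].as<std::string>() << ":"
  //               << vm["port"].as<std::string>() << '\n';
  //     return 0;
  //   }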
+ + std::vector get_affinities_() { return affinities_; } + + string get_log_path() { return log_path_; } + void set_log_path(string log_path) { log_path_ = log_path; } + + string get_log_level() { return log_level_; } + void set_log_level(string log_level) { log_level_ = log_level; } + + private: + string ip_; + string port_; + int network_buffer_size_; + int network_buffer_num_; + int network_worker_num_; + vector pool_paths_; + vector sizes_; + vector affinities_; + string log_path_; + string log_level_; +}; + +#endif // PMPOOL_CONFIG_H_ diff --git a/rpmp/pmpool/DataServer.cc b/rpmp/pmpool/DataServer.cc new file mode 100644 index 00000000..47859c50 --- /dev/null +++ b/rpmp/pmpool/DataServer.cc @@ -0,0 +1,42 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/DataServer.cc + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include "pmpool/DataServer.h" + +#include "AllocatorProxy.h" +#include "Config.h" +#include "Digest.h" +#include "NetworkServer.h" +#include "Protocol.h" +#include "Log.h" + +DataServer::DataServer(Config *config, Log *log) : config_(config), log_(log) {} + +int DataServer::init() { + networkServer_ = std::make_shared(config_, log_); + CHK_ERR("network server init", networkServer_->init()); + log_->get_file_log()->info("network server initialized."); + + allocatorProxy_ = + std::make_shared(config_, log_, networkServer_.get()); + CHK_ERR("allocator proxy init", allocatorProxy_->init()); + log_->get_file_log()->info("allocator proxy initialized."); + + protocol_ = std::make_shared(config_, log_, networkServer_.get(), + allocatorProxy_.get()); + CHK_ERR("protocol init", protocol_->init()); + log_->get_file_log()->info("protocol initialized."); + + networkServer_->start(); + log_->get_file_log()->info("network server started."); + log_->get_console_log()->info("RPMP started..."); + return 0; +} + +void DataServer::wait() { networkServer_->wait(); } diff --git a/rpmp/pmpool/DataServer.h b/rpmp/pmpool/DataServer.h new file mode 100644 index 00000000..14e70ec1 --- /dev/null +++ b/rpmp/pmpool/DataServer.h @@ -0,0 +1,45 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/DataServer.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_DATASERVER_H_ +#define PMPOOL_DATASERVER_H_ + +#include +#include + +#include + +class Config; +class Protocol; +class Digest; +class DataList; +class AllocatorProxy; +class NetworkServer; +class Log; + +/** + * @brief DataServer is designed as distributed remote memory pool. + * DataServer on every node communicated with each other to guarantee data consistency. 
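 * As DataServer::init() above shows, one server instance wires together a
 * NetworkServer (HPNL-based RDMA transport), an AllocatorProxy (persistent
 * memory allocation across several pools) and a Protocol (request dispatch),
 * and only starts listening once all three are initialized.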
+ * + */ +class DataServer { + public: + DataServer() = delete; + explicit DataServer(Config* config, Log* log); + int init(); + void wait(); + private: + Config* config_; + Log* log_; + std::shared_ptr networkServer_; + std::shared_ptr allocatorProxy_; + std::shared_ptr protocol_; +}; + +#endif // PMPOOL_DATASERVER_H_ diff --git a/rpmp/pmpool/Digest.h b/rpmp/pmpool/Digest.h new file mode 100644 index 00000000..cd7b5c7f --- /dev/null +++ b/rpmp/pmpool/Digest.h @@ -0,0 +1,30 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Digest.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_DIGEST_H_ +#define PMPOOL_DIGEST_H_ + +#include +#include + +#include +#include "xxhash/xxhash.h" +#include "xxhash/xxhash.hpp" + +using std::string; + +class Digest { + public: + Digest() = default; + static void computeKeyHash(const string &key, uint64_t *hash) { + *hash = xxh::xxhash<64>(key); + } +}; + +#endif // PMPOOL_DIGEST_H_ diff --git a/rpmp/pmpool/Event.cc b/rpmp/pmpool/Event.cc new file mode 100644 index 00000000..2415c58d --- /dev/null +++ b/rpmp/pmpool/Event.cc @@ -0,0 +1,116 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Request.cc + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, December 12th 2019, 1:36:18 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include "pmpool/Event.h" + +#include "pmpool/buffer/CircularBuffer.h" + +Request::Request(RequestContext requestContext) + : data_(nullptr), size_(0), requestContext_(requestContext) {} + +Request::Request(char *data, uint64_t size, Connection *con) : size_(size) { + data_ = static_cast(std::malloc(size)); + memcpy(data_, data, size_); + requestContext_.con = con; +} + +Request::~Request() { + if (data_ != nullptr) { + std::free(data_); + data_ = nullptr; + } +} + +RequestContext &Request::get_rc() { return requestContext_; } + +void Request::encode() { + OpType rt = requestContext_.type; + assert(rt == ALLOC || rt == FREE || rt == WRITE || rt == READ); + requestMsg_.type = requestContext_.type; + requestMsg_.rid = requestContext_.rid; + requestMsg_.address = requestContext_.address; + requestMsg_.src_address = requestContext_.src_address; + requestMsg_.src_rkey = requestContext_.src_rkey; + requestMsg_.size = requestContext_.size; + requestMsg_.key = requestContext_.key; + + size_ = sizeof(requestMsg_); + data_ = static_cast(std::malloc(size_)); + memcpy(data_, &requestMsg_, size_); +} + +void Request::decode() { + assert(size_ == sizeof(requestMsg_)); + memcpy(&requestMsg_, data_, size_); + requestContext_.type = (OpType)requestMsg_.type; + requestContext_.rid = requestMsg_.rid; + requestContext_.address = requestMsg_.address; + requestContext_.src_address = requestMsg_.src_address; + requestContext_.src_rkey = requestMsg_.src_rkey; + requestContext_.size = requestMsg_.size; + requestContext_.key = requestMsg_.key; +} + +RequestReply::RequestReply(RequestReplyContext requestReplyContext) + : data_(nullptr), size_(0), requestReplyContext_(requestReplyContext) {} + +RequestReply::RequestReply(char *data, uint64_t size, Connection *con) + : size_(size) { + data_ = static_cast(std::malloc(size_)); + memcpy(data_, data, size_); + requestReplyContext_.con = con; +} + +RequestReply::~RequestReply() { + if (data_ != nullptr) { + std::free(data_); + data_ = nullptr; + } +} + +RequestReplyContext &RequestReply::get_rrc() { return requestReplyContext_; } + +void RequestReply::encode() { + 
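  // Wire layout produced here: the fixed-size RequestReplyMsg header is copied
  // byte-for-byte, optionally followed by the block_meta entries in
  // requestReplyContext_.bml (filled for GET_META replies). decode() below
  // reverses this, sizing the bml vector from the bytes left after the header.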
requestReplyMsg_.type = (OpType)requestReplyContext_.type; + requestReplyMsg_.success = requestReplyContext_.success; + requestReplyMsg_.rid = requestReplyContext_.rid; + requestReplyMsg_.address = requestReplyContext_.address; + requestReplyMsg_.size = requestReplyContext_.size; + requestReplyMsg_.key = requestReplyContext_.key; + auto msg_size = sizeof(requestReplyMsg_); + size_ = msg_size; + + /// copy data from block metadata list + uint32_t bml_size = 0; + if (!requestReplyContext_.bml.empty()) { + bml_size = sizeof(block_meta) * requestReplyContext_.bml.size(); + size_ += bml_size; + } + data_ = static_cast(std::malloc(size_)); + memcpy(data_, &requestReplyMsg_, msg_size); + if (bml_size != 0) { + memcpy(data_ + msg_size, &requestReplyContext_.bml[0], bml_size); + } +} + +void RequestReply::decode() { + memcpy(&requestReplyMsg_, data_, size_); + requestReplyContext_.type = (OpType)requestReplyMsg_.type; + requestReplyContext_.success = requestReplyMsg_.success; + requestReplyContext_.rid = requestReplyMsg_.rid; + requestReplyContext_.address = requestReplyMsg_.address; + requestReplyContext_.size = requestReplyMsg_.size; + requestReplyContext_.key = requestReplyMsg_.key; + if (size_ > sizeof(requestReplyMsg_)) { + auto bml_size = size_ - sizeof(requestReplyMsg_); + requestReplyContext_.bml.resize(bml_size / sizeof(block_meta)); + memcpy(&requestReplyContext_.bml[0], data_ + sizeof(requestReplyMsg_), + bml_size); + } +} diff --git a/rpmp/pmpool/Event.h b/rpmp/pmpool/Event.h new file mode 100644 index 00000000..136b73f3 --- /dev/null +++ b/rpmp/pmpool/Event.h @@ -0,0 +1,139 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Request.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Friday, December 13th 2019, 3:43:30 pm + * Author: root + * + * Copyright (c) Intel + */ + +#ifndef PMPOOL_EVENT_H_ +#define PMPOOL_EVENT_H_ + +#include +#include + +#include // NOLINT +#include + +#include "pmpool/Base.h" +#include "pmpool/PmemAllocator.h" + +using std::future; +using std::promise; +using std::vector; + +class RequestHandler; +class ClientRecvCallback; +class Protocol; + +enum OpType : uint32_t { + ALLOC = 1, + FREE, + PREPARE, + WRITE, + READ, + PUT, + GET, + GET_META, + DELETE, + REPLY = 1 << 16, + ALLOC_REPLY, + FREE_REPLY, + PREPARE_REPLY, + WRITE_REPLY, + READ_REPLY, + PUT_REPLY, + GET_REPLY, + GET_META_REPLY, + DELETE_REPLY +}; + +/** + * @brief Define two types of event in this file: Request, RequestReply + * Request: a event that client creates and sends to server. + * RequestReply: a event that server creates and sends to client. + * RequestContext and RequestReplyContext include the context information of the + * previous two events. 
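 *
 * A rough round trip, as a sketch (the real code reaches data_/size_ through
 * the friend classes RequestHandler and ClientRecvCallback; buf/len/con here
 * stand for the values handed to the receive callback):
 *
 *   RequestContext rc;
 *   rc.type = ALLOC;
 *   rc.rid = 1;
 *   rc.size = 4096;
 *   Request req(rc);
 *   req.encode();                        // RequestContext -> RequestMsg bytes
 *
 *   Request received(buf, len, con);     // on the receiving side
 *   received.decode();                   // RequestMsg bytes -> RequestContext
 *   OpType op = received.get_rc().type;  // == ALLOC
 *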
+ */ +struct RequestReplyContext { + OpType type; + uint32_t success; + uint64_t rid; + uint64_t address; + uint64_t src_address; + uint64_t dest_address; + uint64_t src_rkey; + uint64_t size; + uint64_t key; + Connection* con; + Chunk* ck; + vector bml; +}; + +template +inline void encode_(T* t, char* data, uint64_t* size) { + assert(t != nullptr); + memcpy(data, t, sizeof(t)); + *size = sizeof(t); +} + +template +inline void decode_(T* t, char* data, uint64_t size) { + assert(t != nullptr); + assert(size == sizeof(t)); + memcpy(t, data, size); +} + +class RequestReply { + public: + RequestReply() = delete; + explicit RequestReply(RequestReplyContext requestReplyContext); + RequestReply(char* data, uint64_t size, Connection* con); + ~RequestReply(); + RequestReplyContext& get_rrc(); + void decode(); + void encode(); + + private: + friend Protocol; + char* data_; + uint64_t size_; + RequestReplyMsg requestReplyMsg_; + RequestReplyContext requestReplyContext_; +}; + +typedef promise Promise; +typedef future Future; + +struct RequestContext { + OpType type; + uint64_t rid; + uint64_t address; + uint64_t src_address; + uint64_t src_rkey; + uint64_t size; + uint64_t key; + Connection* con; +}; + +class Request { + public: + Request() = delete; + explicit Request(RequestContext requestContext); + Request(char* data, uint64_t size, Connection* con); + ~Request(); + RequestContext& get_rc(); + void encode(); + void decode(); + + private: + friend RequestHandler; + friend ClientRecvCallback; + char* data_; + uint64_t size_; + RequestMsg requestMsg_; + RequestContext requestContext_; +}; + +#endif // PMPOOL_EVENT_H_ diff --git a/rpmp/pmpool/Log.h b/rpmp/pmpool/Log.h new file mode 100644 index 00000000..e396e188 --- /dev/null +++ b/rpmp/pmpool/Log.h @@ -0,0 +1,49 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Log.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Friday, February 28th 2020, 2:37:41 pm + * Author: root + * + * Copyright (c) 2020 Intel + */ + +#ifndef PMPOOL_LOG_H_ +#define PMPOOL_LOG_H_ + +#include + +#include "Config.h" +#include "spdlog/spdlog.h" + +class Log { + public: + explicit Log(Config *config) : config_(config) { + file_log_ = spdlog::basic_logger_mt("file_logger", config_->get_log_path()); + if (config_->get_log_level() == "debug") { + file_log_->set_level(spdlog::level::debug); + file_log_->flush_on(spdlog::level::debug); + } else if (config_->get_log_level() == "info") { + file_log_->set_level(spdlog::level::info); + file_log_->flush_on(spdlog::level::info); + } else if (config_->get_log_level() == "warn") { + file_log_->set_level(spdlog::level::warn); + file_log_->flush_on(spdlog::level::warn); + } else if (config_->get_log_level() == "error") { + file_log_->set_level(spdlog::level::err); + file_log_->flush_on(spdlog::level::err); + } else { + } + console_log_ = spdlog::stdout_color_mt("console"); + console_log_->flush_on(spdlog::level::info); + } + + std::shared_ptr get_file_log() { return file_log_; } + std::shared_ptr get_console_log() { return console_log_; } + + private: + Config *config_; + std::shared_ptr file_log_; + std::shared_ptr console_log_; +}; + +#endif // PMPOOL_LOG_H_ diff --git a/rpmp/pmpool/NetworkServer.cc b/rpmp/pmpool/NetworkServer.cc new file mode 100644 index 00000000..2f731a79 --- /dev/null +++ b/rpmp/pmpool/NetworkServer.cc @@ -0,0 +1,134 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/NetworkServer.cc + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Tuesday, December 24th 2019, 7:29:48 pm + * Author: root + 
* + * Copyright (c) 2019 Intel + */ + +#include "pmpool/NetworkServer.h" + +#include "Base.h" +#include "Config.h" +#include "Event.h" +#include "Log.h" +#include "buffer/CircularBuffer.h" + +NetworkServer::NetworkServer(Config *config, Log *log) + : config_(config), log_(log) { + time = 0; +} + +NetworkServer::~NetworkServer() { + for (int i = 0; i < buffer_id_; i++) { + unregister_rma_buffer(i); + } +} + +int NetworkServer::init() { + server_ = std::make_shared(config_->get_network_worker_num(), + config_->get_network_buffer_num()); + CHK_ERR("hpnl server init", server_->init()); + + chunkMgr_ = std::make_shared(server_.get(), + config_->get_network_buffer_size(), + config_->get_network_buffer_num()); + + server_->set_chunk_mgr(chunkMgr_.get()); + return 0; +} + +int NetworkServer::start() { + server_->start(); + CHK_ERR("hpnl server listen", server_->listen(config_->get_ip().c_str(), + config_->get_port().c_str())); + + circularBuffer_ = + std::make_shared(1024 * 1024, 4096, true, this); + return 0; +} + +void NetworkServer::wait() { server_->wait(); } + +Chunk *NetworkServer::register_rma_buffer(char *rma_buffer, uint64_t size) { + return server_->reg_rma_buffer(rma_buffer, size, buffer_id_++); +} + +void NetworkServer::unregister_rma_buffer(int buffer_id) { + server_->unreg_rma_buffer(buffer_id); +} + +void NetworkServer::get_dram_buffer(RequestReplyContext *rrc) { + char *buffer = circularBuffer_->get(rrc->size); + rrc->dest_address = (uint64_t)buffer; + + Chunk *base_ck = circularBuffer_->get_rma_chunk(); + uint64_t offset = circularBuffer_->get_offset(rrc->dest_address); + + // encapsulate new chunk + Chunk *ck = new Chunk(); + ck->buffer = static_cast(base_ck->buffer) + offset; + ck->capacity = base_ck->capacity; + ck->buffer_id = buffer_id_++; + ck->mr = base_ck->mr; + ck->size = rrc->size; + rrc->ck = ck; +} + +void NetworkServer::reclaim_dram_buffer(RequestReplyContext *rrc) { + char *buffer_tmp = reinterpret_cast(rrc->dest_address); + circularBuffer_->put(buffer_tmp, rrc->size); + delete rrc->ck; +} + +void NetworkServer::get_pmem_buffer(RequestReplyContext *rrc, Chunk *base_ck) { + Chunk *ck = new Chunk(); + ck->buffer = reinterpret_cast(rrc->dest_address); + ck->capacity = rrc->size; + ck->buffer_id = buffer_id_++; + ck->mr = base_ck->mr; + ck->size = rrc->size; + rrc->ck = ck; +} + +void NetworkServer::reclaim_pmem_buffer(RequestReplyContext *rrc) { + if (rrc->ck != nullptr) { + delete rrc->ck; + } +} + +ChunkMgr *NetworkServer::get_chunk_mgr() { return chunkMgr_.get(); } + +void NetworkServer::set_recv_callback(Callback *callback) { + server_->set_recv_callback(callback); +} + +void NetworkServer::set_send_callback(Callback *callback) { + server_->set_send_callback(callback); +} + +void NetworkServer::set_read_callback(Callback *callback) { + server_->set_read_callback(callback); +} + +void NetworkServer::set_write_callback(Callback *callback) { + server_->set_write_callback(callback); +} + +void NetworkServer::send(char *data, uint64_t size, Connection *con) { + auto ck = chunkMgr_->get(con); + std::memcpy(reinterpret_cast(ck->buffer), data, size); + ck->size = size; + con->send(ck); +} + +void NetworkServer::read(RequestReply *rr) { + RequestReplyContext rrc = rr->get_rrc(); + rrc.con->read(rrc.ck, 0, rrc.size, rrc.src_address, rrc.src_rkey); +} + +void NetworkServer::write(RequestReply *rr) { + RequestReplyContext rrc = rr->get_rrc(); + rrc.con->write(rrc.ck, 0, rrc.size, rrc.src_address, rrc.src_rkey); +} diff --git a/rpmp/pmpool/NetworkServer.h 
b/rpmp/pmpool/NetworkServer.h new file mode 100644 index 00000000..d125a83c --- /dev/null +++ b/rpmp/pmpool/NetworkServer.h @@ -0,0 +1,85 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/NetworkServer.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Tuesday, December 10th 2019, 3:14:59 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_NETWORKSERVER_H_ +#define PMPOOL_NETWORKSERVER_H_ + +#include +#include +#include + +#include +#include + +#include "RmaBufferRegister.h" + +class CircularBuffer; +class Config; +class RequestReply; +class RequestReplyContext; +class Log; + +/** + * @brief RPMP network service is based on HPNL, which is a completely + * asynchronous network library. RPMP currently supports RDMA iWarp and RoCE V2 + * protocol. + */ +class NetworkServer : public RmaBufferRegister { + public: + NetworkServer() = delete; + NetworkServer(Config *config, Log *log_); + ~NetworkServer(); + int init(); + int start(); + void wait(); + /// register DRAM or Persistent Memory as RDMA region. + /// Return chunk that is the wrapper of RDMA region if succeed, + /// return nullptr if fail. + Chunk *register_rma_buffer(char *rma_buffer, uint64_t size) override; + + /// unregister RDMA region for given buffer. + void unregister_rma_buffer(int buffer_id) override; + + /// get DRAM buffer from circular buffer pool. + void get_dram_buffer(RequestReplyContext *rrc); + + /// reclaim DRAM buffer from circular buffer pool. + void reclaim_dram_buffer(RequestReplyContext *rrc); + + /// get Persistent Memory buffer from circular buffer pool + void get_pmem_buffer(RequestReplyContext *rrc, Chunk *ck); + + /// reclaim Persistent Memory buffer form circular buffer pool + void reclaim_pmem_buffer(RequestReplyContext *rrc); + + /// return the pointer of chunk manager. + ChunkMgr *get_chunk_mgr(); + + /// since the network implementation is asynchronous, + /// we need to define callback better before starting network service. 
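  /// In practice Protocol::init() builds the four callbacks and registers them
  /// through set_recv_callback()/set_send_callback()/set_read_callback()/
  /// set_write_callback() before DataServer::init() calls start().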
+ void set_recv_callback(Callback *callback); + void set_send_callback(Callback *callback); + void set_read_callback(Callback *callback); + void set_write_callback(Callback *callback); + + void send(char *data, uint64_t size, Connection *con); + void read(RequestReply *rrc); + void write(RequestReply *rrc); + + private: + Config *config_; + Log* log_; + std::shared_ptr server_; + std::shared_ptr chunkMgr_; + std::shared_ptr circularBuffer_; + std::atomic buffer_id_{0}; + uint64_t time; +}; + +#endif // PMPOOL_NETWORKSERVER_H_ diff --git a/rpmp/pmpool/PmemAllocator.h b/rpmp/pmpool/PmemAllocator.h new file mode 100644 index 00000000..b6d279ce --- /dev/null +++ b/rpmp/pmpool/PmemAllocator.h @@ -0,0 +1,392 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/PmemObjAllocator.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Monday, December 9th 2019, 10:52:02 am + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_PMEMALLOCATOR_H_ +#define PMPOOL_PMEMALLOCATOR_H_ + +#include + +#include +#include // NOLINT +#include +#include +#include +#include +#include + +#include "Allocator.h" +#include "DataServer.h" +#include "Log.h" +#include "NetworkServer.h" + +using std::shared_ptr; +using std::unordered_map; + +#define PMEMOBJ_ALLOCATOR_LAYOUT_NAME "pmemobj_allocator_layout" + +// block header stored in pmem +struct block_hdr { + PMEMoid next; + PMEMoid pre; + uint64_t addr; + uint64_t size; +}; + +// block data entry stored in pmem +struct block_entry { + struct block_hdr hdr; + PMEMoid data; +}; + +// pmem root entry +struct Base { + PMEMoid head; + PMEMoid tail; + PMEMrwlock rwlock; + uint64_t bytes_written; +}; + +struct PmemContext { + PMEMobjpool *pop; + PMEMoid poid; + Base *base; +}; + +// pmem data allocation types +enum types { BLOCK_ENTRY_TYPE, DATA_TYPE, MAX_TYPE }; + +/** + * @brief libpmemobj based implementation of Allocator interface. 
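 *
 * Each allocation is linked into a persistent list rooted at struct Base and
 * is addressed by a "global" address built with TO_GLOB(): the pool/worker id
 * lives in the top 16 bits (see GET_WID()), which is how AllocatorProxy routes
 * later write/release/read requests back to the right pool.
 *
 * allocate_and_write() and release() below both follow the usual libpmemobj
 * transaction pattern, roughly as sketched here (pop and root_oid stand for
 * the pool handle and root object kept in pmemContext_):
 *
 *   jmp_buf env;
 *   if (setjmp(env)) { (void)pmemobj_tx_end(); return -1; }     // tx aborted
 *   if (pmemobj_tx_begin(pop, env, TX_PARAM_NONE)) return -1;   // start tx
 *   PMEMoid oid = pmemobj_tx_zalloc(size, DATA_TYPE);           // pmem alloc
 *   pmemobj_tx_add_range(root_oid, 0, sizeof(struct Base));     // undo log
 *   // ... link the new block into the list rooted at Base ...
 *   pmemobj_tx_commit();
 *   (void)pmemobj_tx_end();
 *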
+ * + */ +class PmemObjAllocator : public Allocator { + public: + PmemObjAllocator() = delete; + explicit PmemObjAllocator(Log *log, DiskInfo *diskInfos, + NetworkServer *server, int wid) + : log_(log), diskInfo_(diskInfos), server_(server), wid_(wid) {} + ~PmemObjAllocator() { close(); } + + int init() override { + memset(str, '0', 1048576); + if (create()) { + int res = open(); + if (res) { + string err_msg = pmemobj_errormsg(); + log_->get_file_log()->error("failed to open pmem pool, errmsg: " + + err_msg); + } + } + return 0; + } + + uint64_t allocate_and_write(uint64_t size, + const char *content = nullptr) override { + jmp_buf env; + if (setjmp(env)) { + // end the transaction + (void)pmemobj_tx_end(); + return -1; + } + + // begin a transaction, also acquiring the write lock for the data + if (pmemobj_tx_begin(pmemContext_.pop, env, TX_PARAM_RWLOCK, + &pmemContext_.base->rwlock, TX_PARAM_NONE)) { + perror("pmemobj_tx_begin failed in pmemkv put"); + return -1; + } + + // allocate the new node to be inserted + PMEMoid beo = + pmemobj_tx_alloc(sizeof(struct block_entry), BLOCK_ENTRY_TYPE); + if (beo.off == 0 && beo.pool_uuid_lo == 0) { + (void)pmemobj_tx_end(); + perror("pmemobj_tx_alloc failed in pmemkv put"); + return -1; + } + struct block_entry *bep = (struct block_entry *)pmemobj_direct(beo); + bep->data = pmemobj_tx_zalloc(size, DATA_TYPE); + bep->hdr.next = OID_NULL; + bep->hdr.addr = TO_GLOB((uint64_t)pmemobj_direct(bep->data), + (uint64_t)pmemContext_.pop, wid_); + bep->hdr.size = size; + + uint64_t start = + std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + char *pmem_data = static_cast(pmemobj_direct(bep->data)); + if (content != nullptr) { + memcpy(pmem_data, content, size); + } + uint64_t end = + std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + total += (end - start); + // std::cout << "index " << wid_ << ", total is " << total / 1000.0 + // << std::endl; + + // add the modified root object to the undo data + pmemobj_tx_add_range(pmemContext_.poid, 0, sizeof(struct Base)); + if (pmemContext_.base->tail.off == 0) { + // update head + pmemContext_.base->head = beo; + bep->hdr.pre = OID_NULL; + } else { + // add the modified tail entry to the undo data + bep->hdr.pre = pmemContext_.base->tail; + pmemobj_tx_add_range(pmemContext_.base->tail, 0, + sizeof(struct block_entry)); + ((struct block_entry *)pmemobj_direct(pmemContext_.base->tail)) + ->hdr.next = beo; + } + + pmemContext_.base->tail = beo; // update tail + pmemContext_.base->bytes_written += size; + pmemobj_tx_commit(); + (void)pmemobj_tx_end(); + + // update in-memory index + if (update_meta(beo)) { + return -1; + } + + return bep->hdr.addr; + } + + int write(uint64_t address, const char *content, uint64_t size) override { + std::unique_lock l(mtx); + if (!index_map.count(address)) { + return -1; + } + PMEMoid data = index_map[address]; + struct block_entry *bep = (struct block_entry *)pmemobj_direct(data); + char *pmem_data = static_cast(pmemobj_direct(bep->data)); + // pmemobj_memcpy_persist(pmemContext_.pop, pmem_data, content, size); + uint64_t start = + std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + memcpy(pmem_data, content, size); + uint64_t end = + std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + total += (end - start); + l.unlock(); + return 0; + } + + uint64_t get_virtual_address(uint64_t address) { + std::unique_lock 
l(mtx); + if (!index_map.count(address)) { + return -1; + } + PMEMoid data = index_map[address]; + struct block_entry *bep = (struct block_entry *)pmemobj_direct(data); + char *pmem_data = static_cast(pmemobj_direct(bep->data)); + l.unlock(); + return (uint64_t)pmem_data; + } + + int release(uint64_t address) override { + jmp_buf env; + if (setjmp(env)) { + // end the transaction + (void)pmemobj_tx_end(); + return -1; + } + + // begin a transaction, also acquiring the write lock for the data + if (pmemobj_tx_begin(pmemContext_.pop, env, TX_PARAM_RWLOCK, + &pmemContext_.base->rwlock, TX_PARAM_NONE)) { + perror("pmemobj_tx_begin failed in pmemkv put"); + return -1; + } + if (!index_map.count(address)) { + (void)pmemobj_tx_end(); + perror("address not found"); + return -1; + } + PMEMoid data = index_map[address]; + struct block_entry *bep = (struct block_entry *)pmemobj_direct(data); + struct block_entry *prev_bep = + (struct block_entry *)pmemobj_direct(bep->hdr.pre); + struct block_entry *next_bep = + (struct block_entry *)pmemobj_direct(bep->hdr.next); + pmemobj_tx_add_range(pmemContext_.poid, 0, sizeof(struct Base)); + if (prev_bep == nullptr) { + if (next_bep == nullptr) { + pmemContext_.base->head = OID_NULL; + pmemContext_.base->tail = OID_NULL; + } else { + pmemContext_.base->head = bep->hdr.next; + next_bep->hdr.pre = OID_NULL; + } + } else { + pmemobj_tx_add_range(bep->hdr.pre, 0, sizeof(struct Base)); + prev_bep->hdr.next = bep->hdr.next; + } + pmemContext_.base->bytes_written -= bep->hdr.size; + pmemobj_tx_add_range(data, 0, sizeof(struct Base)); + bep->hdr.pre = OID_NULL; + bep->hdr.next = OID_NULL; + pmemobj_free(&data); + pmemobj_free(&bep->data); + + pmemobj_tx_commit(); + (void)pmemobj_tx_end(); + + return 0; + } + + int release_all() override { + PMEMoid cur_oid = pmemContext_.base->head; + while (cur_oid.off != 0 && cur_oid.pool_uuid_lo != 0) { + struct block_entry *cur_bep = + (struct block_entry *)pmemobj_direct(cur_oid); + PMEMoid next_oid = cur_bep->hdr.next; + struct block_entry *next_bep = + (struct block_entry *)pmemobj_direct(next_oid); + pmemobj_free(&cur_oid); + pmemobj_free(&cur_bep->data); + cur_oid = next_oid; + } + pmemContext_.base->head = OID_NULL; + pmemContext_.base->tail = OID_NULL; + pmemContext_.base->bytes_written = 0; + + return 0; + } + + int dump_all() override { + std::cout << "******************worker " << wid_ + << " start dump*********************" << std::endl; + if (pmemobj_rwlock_rdlock(pmemContext_.pop, &pmemContext_.base->rwlock) != + 0) { + return -1; + } + struct block_entry *next_bep = + (struct block_entry *)pmemobj_direct(pmemContext_.base->head); + uint64_t read_offset = 0; + while (next_bep != nullptr) { + char *pmem_data = + reinterpret_cast(pmemobj_direct(next_bep->data)); + char *tmp = reinterpret_cast(std::malloc(next_bep->hdr.size)); + memcpy(tmp, pmem_data, next_bep->hdr.size); + std::cout << "dump address " << next_bep->hdr.addr << std::endl; + read_offset += next_bep->hdr.size; + std::free(tmp); + next_bep = (struct block_entry *)pmemobj_direct(next_bep->hdr.next); + } + pmemobj_rwlock_unlock(pmemContext_.pop, &pmemContext_.base->rwlock); + std::cout << "total size " << pmemContext_.base->bytes_written << std::endl; + std::cout << "******************worker " << wid_ + << " end dump*********************" << std::endl; + return 0; + } + + Chunk *get_rma_chunk() { return base_ck; } + + private: + int create() { + // debug setting + int sds_write_value = 0; + pmemobj_ctl_set(nullptr, "sds.at_create", &sds_write_value); + + 
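    // What follows: create the pmemobj pool directly on the configured path
    // (a devdax device in the default config; poolsize 0 takes the size from
    // the existing file or device), take the root object as struct Base, and,
    // if a NetworkServer was supplied, register the entire mapped pool as a
    // single RDMA region so remote reads and writes can target persistent
    // memory directly.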
pmemContext_.pop = pmemobj_create(diskInfo_->path.c_str(), + PMEMOBJ_ALLOCATOR_LAYOUT_NAME, 0, 0666); + if (pmemContext_.pop == nullptr) { + string err_msg = pmemobj_errormsg(); + log_->get_file_log()->warn("failed to create pmem pool, errmsg: " + + err_msg); + return -1; + } + pmemContext_.poid = pmemobj_root(pmemContext_.pop, sizeof(struct Base)); + pmemContext_.base = (struct Base *)pmemobj_direct(pmemContext_.poid); + pmemContext_.base->head = OID_NULL; + pmemContext_.base->tail = OID_NULL; + pmemContext_.base->bytes_written = 0; + + if (server_) { + base_ck = server_->register_rma_buffer( + reinterpret_cast(pmemContext_.pop), diskInfo_->size); + assert(base_ck != nullptr); + log_->get_console_log()->info( + "successfully registered Persistent Memory(" + diskInfo_->path + + ") as RDMA region"); + } + return 0; + } + + int open() { + // debug setting + int sds_write_value = 0; + pmemobj_ctl_set(nullptr, "sds.at_create", &sds_write_value); + + pmemContext_.pop = + pmemobj_open(diskInfo_->path.c_str(), PMEMOBJ_ALLOCATOR_LAYOUT_NAME); + if (pmemContext_.pop == nullptr) { + return -1; + } + + if (server_) { + base_ck = server_->register_rma_buffer( + reinterpret_cast(pmemContext_.pop), diskInfo_->size); + assert(base_ck != nullptr); + log_->get_console_log()->info( + "successfully registered Persistent Memory(" + diskInfo_->path + + ") as RDMA region"); + } + + pmemContext_.poid = pmemobj_root(pmemContext_.pop, sizeof(struct Base)); + pmemContext_.base = (struct Base *)pmemobj_direct(pmemContext_.poid); + PMEMoid next = pmemContext_.base->head; + while (next.off != 0 && next.pool_uuid_lo != 0) { + if (update_meta(next)) { + return -1; + } + struct block_entry *bep = (struct block_entry *)pmemobj_direct(next); + next = bep->hdr.next; + } + return 0; + } + + void close() { + pmemobj_close(pmemContext_.pop); + free_meta(); + } + + int update_meta(const PMEMoid &oid) { + std::lock_guard l(mtx); + struct block_entry *bep = (struct block_entry *)pmemobj_direct(oid); + if (!index_map.count(bep->hdr.addr)) { + index_map[bep->hdr.addr] = oid; + } else { + assert("invalide operation."); + } + return 0; + } + + int free_meta() { + std::lock_guard l(mtx); + index_map.clear(); + } + + private: + Log *log_; + DiskInfo *diskInfo_; + NetworkServer *server_; + int wid_; + PmemContext pmemContext_; + std::mutex mtx; + unordered_map index_map; + uint64_t total = 0; + char str[1048576]; + Chunk *base_ck; +}; + +#endif // PMPOOL_PMEMALLOCATOR_H_ diff --git a/rpmp/pmpool/Protocol.cc b/rpmp/pmpool/Protocol.cc new file mode 100644 index 00000000..cfa88f81 --- /dev/null +++ b/rpmp/pmpool/Protocol.cc @@ -0,0 +1,378 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Protocol.cc + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include "pmpool/Protocol.h" + +#include + +#include "AllocatorProxy.h" +#include "Config.h" +#include "Digest.h" +#include "Event.h" +#include "Log.h" +#include "NetworkServer.h" + +RecvCallback::RecvCallback(Protocol *protocol, ChunkMgr *chunkMgr) + : protocol_(protocol), chunkMgr_(chunkMgr) {} + +void RecvCallback::operator()(void *buffer_id, void *buffer_size) { + auto buffer_id_ = *static_cast(buffer_id); + Chunk *ck = chunkMgr_->get(buffer_id_); + assert(*static_cast(buffer_size) == ck->size); + Request *request = new Request(reinterpret_cast(ck->buffer), ck->size, + reinterpret_cast(ck->con)); + request->decode(); + protocol_->enqueue_recv_msg(request); + chunkMgr_->reclaim(ck, 
static_cast(ck->con)); +} + +ReadCallback::ReadCallback(Protocol *protocol) : protocol_(protocol) {} + +void ReadCallback::operator()(void *buffer_id, void *buffer_size) { + auto buffer_id_ = *static_cast(buffer_id); + protocol_->enqueue_rma_msg(buffer_id_); +} + +SendCallback::SendCallback(ChunkMgr *chunkMgr) : chunkMgr_(chunkMgr) {} + +void SendCallback::operator()(void *buffer_id, void *buffer_size) { + auto buffer_id_ = *static_cast(buffer_id); + auto ck = chunkMgr_->get(buffer_id_); + + /// free the memory of class RequestReply + auto reqeustReply = static_cast(ck->ptr); + delete reqeustReply; + + chunkMgr_->reclaim(ck, static_cast(ck->con)); +} + +WriteCallback::WriteCallback(Protocol *protocol) : protocol_(protocol) {} + +void WriteCallback::operator()(void *buffer_id, void *buffer_size) { + auto buffer_id_ = *static_cast(buffer_id); + protocol_->enqueue_rma_msg(buffer_id_); +} + +RecvWorker::RecvWorker(Protocol *protocol, int index) + : protocol_(protocol), index_(index) { + init = false; +} + +int RecvWorker::entry() { + if (!init) { + set_affinity(index_); + init = true; + } + Request *request; + bool res = pendingRecvRequestQueue_.wait_dequeue_timed( + request, std::chrono::milliseconds(1000)); + if (res) { + protocol_->handle_recv_msg(request); + } + return 0; +} + +void RecvWorker::abort() {} + +void RecvWorker::addTask(Request *request) { + pendingRecvRequestQueue_.enqueue(request); +} + +ReadWorker::ReadWorker(Protocol *protocol, int index) + : protocol_(protocol), index_(index) { + init = false; +} + +int ReadWorker::entry() { + if (!init) { + set_affinity(index_); + init = true; + } + RequestReply *requestReply; + bool res = pendingReadRequestQueue_.wait_dequeue_timed( + requestReply, std::chrono::milliseconds(1000)); + if (res) { + protocol_->handle_rma_msg(requestReply); + } + return 0; +} + +void ReadWorker::abort() {} + +void ReadWorker::addTask(RequestReply *rr) { + pendingReadRequestQueue_.enqueue(rr); +} + +FinalizeWorker::FinalizeWorker(Protocol *protocol) : protocol_(protocol) {} + +int FinalizeWorker::entry() { + RequestReply *requestReply; + bool res = pendingRequestReplyQueue_.wait_dequeue_timed( + requestReply, std::chrono::milliseconds(1000)); + if (res) { + protocol_->handle_finalize_msg(requestReply); + } + return 0; +} + +void FinalizeWorker::abort() {} + +void FinalizeWorker::addTask(RequestReply *requestReply) { + pendingRequestReplyQueue_.enqueue(requestReply); +} + +Protocol::Protocol(Config *config, Log *log, NetworkServer *server, + AllocatorProxy *allocatorProxy) + : config_(config), + log_(log), + networkServer_(server), + allocatorProxy_(allocatorProxy) { + time = 0; +} + +Protocol::~Protocol() { + for (auto worker : recvWorkers_) { + worker->stop(); + worker->join(); + } + for (auto worker : readWorkers_) { + worker->stop(); + worker->join(); + } + finalizeWorker_->stop(); + finalizeWorker_->join(); +} + +int Protocol::init() { + recvCallback_ = + std::make_shared(this, networkServer_->get_chunk_mgr()); + sendCallback_ = + std::make_shared(networkServer_->get_chunk_mgr()); + readCallback_ = std::make_shared(this); + writeCallback_ = std::make_shared(this); + + for (int i = 0; i < config_->get_pool_size(); i++) { + auto recvWorker = new RecvWorker(this, config_->get_affinities_()[i] - 1); + recvWorker->start(); + recvWorkers_.push_back(std::shared_ptr(recvWorker)); + } + + finalizeWorker_ = make_shared(this); + finalizeWorker_->start(); + + for (int i = 0; i < config_->get_pool_size(); i++) { + auto readWorker = new ReadWorker(this, 
config_->get_affinities_()[i]); + readWorker->start(); + readWorkers_.push_back(std::shared_ptr(readWorker)); + } + + networkServer_->set_recv_callback(recvCallback_.get()); + networkServer_->set_send_callback(sendCallback_.get()); + networkServer_->set_read_callback(readCallback_.get()); + networkServer_->set_write_callback(writeCallback_.get()); + return 0; +} + +void Protocol::enqueue_recv_msg(Request *request) { + RequestContext rc = request->get_rc(); + if (rc.address != 0) { + auto wid = GET_WID(rc.address); + recvWorkers_[wid]->addTask(request); + } else { + recvWorkers_[rc.rid % config_->get_pool_size()]->addTask(request); + } +} + +void Protocol::handle_recv_msg(Request *request) { + RequestContext rc = request->get_rc(); + RequestReplyContext rrc; + switch (rc.type) { + case ALLOC: { + uint64_t addr = allocatorProxy_->allocate_and_write( + rc.size, nullptr, rc.rid % config_->get_pool_size()); + auto wid = GET_WID(addr); + assert(wid == rc.rid % config_->get_pool_size()); + rrc.type = ALLOC_REPLY; + rrc.success = 0; + rrc.rid = rc.rid; + rrc.address = addr; + rrc.size = rc.size; + rrc.con = rc.con; + RequestReply *requestReply = new RequestReply(rrc); + rrc.ck->ptr = requestReply; + enqueue_finalize_msg(requestReply); + break; + } + case FREE: { + rrc.type = FREE_REPLY; + rrc.success = allocatorProxy_->release(rc.address); + rrc.rid = rc.rid; + rrc.address = rc.address; + rrc.size = rc.size; + rrc.con = rc.con; + RequestReply *requestReply = new RequestReply(rrc); + rrc.ck->ptr = requestReply; + enqueue_finalize_msg(requestReply); + break; + } + case WRITE: { + rrc.type = WRITE_REPLY; + rrc.success = 0; + rrc.rid = rc.rid; + rrc.address = rc.address; + rrc.src_address = rc.src_address; + rrc.src_rkey = rc.src_rkey; + rrc.size = rc.size; + rrc.con = rc.con; + networkServer_->get_dram_buffer(&rrc); + RequestReply *requestReply = new RequestReply(rrc); + rrc.ck->ptr = requestReply; + + std::unique_lock lk(rrcMtx_); + rrcMap_[rrc.ck->buffer_id] = requestReply; + lk.unlock(); + networkServer_->read(requestReply); + break; + } + case READ: { + rrc.type = READ_REPLY; + rrc.success = 0; + rrc.rid = rc.rid; + rrc.address = rc.address; + rrc.src_address = rc.src_address; + rrc.src_rkey = rc.src_rkey; + rrc.size = rc.size; + rrc.con = rc.con; + rrc.dest_address = allocatorProxy_->get_virtual_address(rrc.address); + rrc.ck = nullptr; + Chunk *base_ck = allocatorProxy_->get_rma_chunk(rrc.address); + networkServer_->get_pmem_buffer(&rrc, base_ck); + RequestReply *requestReply = new RequestReply(rrc); + rrc.ck->ptr = requestReply; + + std::unique_lock lk(rrcMtx_); + rrcMap_[rrc.ck->buffer_id] = requestReply; + lk.unlock(); + networkServer_->write(requestReply); + break; + } + case PUT: { + rrc.type = PUT_REPLY; + rrc.success = 0; + rrc.rid = rc.rid; + rrc.address = rc.address; + rrc.src_address = rc.src_address; + rrc.src_rkey = rc.src_rkey; + rrc.size = rc.size; + rrc.key = rc.key; + rrc.con = rc.con; + networkServer_->get_dram_buffer(&rrc); + RequestReply *requestReply = new RequestReply(rrc); + rrc.ck->ptr = requestReply; + + std::unique_lock lk(rrcMtx_); + rrcMap_[rrc.ck->buffer_id] = requestReply; + lk.unlock(); + networkServer_->read(requestReply); + break; + } + case GET_META: { + rrc.type = GET_META_REPLY; + rrc.success = 0; + rrc.rid = rc.rid; + rrc.size = rc.size; + rrc.key = rc.key; + rrc.con = rc.con; + RequestReply *requestReply = new RequestReply(rrc); + rrc.ck->ptr = requestReply; + enqueue_finalize_msg(requestReply); + } + case DELETE: { + rrc.type = DELETE_REPLY; + rrc.key = 
rc.key; + rrc.con = rc.con; + rrc.rid = rc.rid; + rrc.success = 0; + } + default: { break; } + } + + delete request; +} + +void Protocol::enqueue_finalize_msg(RequestReply *requestReply) { + finalizeWorker_->addTask(requestReply); +} + +void Protocol::handle_finalize_msg(RequestReply *requestReply) { + RequestReplyContext rrc = requestReply->get_rrc(); + if (rrc.type == PUT_REPLY) { + allocatorProxy_->cache_chunk(rrc.key, rrc.address, rrc.size); + } else if (rrc.type == GET_META_REPLY) { + auto bml = allocatorProxy_->get_cached_chunk(rrc.key); + requestReply->requestReplyContext_.bml = bml; + } else if (rrc.type == DELETE_REPLY) { + auto bml = allocatorProxy_->get_cached_chunk(rrc.key); + for (auto bm : bml) { + rrc.success = allocatorProxy_->release(bm.address); + if (rrc.success) { + break; + } + } + allocatorProxy_->del_chunk(rrc.key); + } else { + } + requestReply->encode(); + networkServer_->send(reinterpret_cast(requestReply->data_), + requestReply->size_, rrc.con); +} + +void Protocol::enqueue_rma_msg(uint64_t buffer_id) { + std::unique_lock lk(rrcMtx_); + RequestReply *requestReply = rrcMap_[buffer_id]; + lk.unlock(); + RequestReplyContext rrc = requestReply->get_rrc(); + if (rrc.address != 0) { + auto wid = GET_WID(rrc.address); + readWorkers_[wid]->addTask(requestReply); + } else { + readWorkers_[rrc.rid % config_->get_pool_size()]->addTask(requestReply); + } +} + +void Protocol::handle_rma_msg(RequestReply *requestReply) { + RequestReplyContext &rrc = requestReply->get_rrc(); + switch (rrc.type) { + case WRITE_REPLY: { + char *buffer = static_cast(rrc.ck->buffer); + if (rrc.address == 0) { + rrc.address = allocatorProxy_->allocate_and_write( + rrc.size, buffer, rrc.rid % config_->get_pool_size()); + } else { + allocatorProxy_->write(rrc.address, buffer, rrc.size); + } + networkServer_->reclaim_dram_buffer(&rrc); + break; + } + case READ_REPLY: { + networkServer_->reclaim_pmem_buffer(&rrc); + break; + } + case PUT_REPLY: { + char *buffer = static_cast(rrc.ck->buffer); + assert(rrc.address == 0); + rrc.address = allocatorProxy_->allocate_and_write( + rrc.size, buffer, rrc.rid % config_->get_pool_size()); + networkServer_->reclaim_dram_buffer(&rrc); + break; + } + default: { break; } + } + enqueue_finalize_msg(requestReply); +} diff --git a/rpmp/pmpool/Protocol.h b/rpmp/pmpool/Protocol.h new file mode 100644 index 00000000..b2cfb679 --- /dev/null +++ b/rpmp/pmpool/Protocol.h @@ -0,0 +1,194 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/Protocol.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_PROTOCOL_H_ +#define PMPOOL_PROTOCOL_H_ + +#include +#include +#include + +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include + +#include "Event.h" +#include "ThreadWrapper.h" +#include "queue/blockingconcurrentqueue.h" +#include "queue/concurrentqueue.h" + +class Digest; +class AllocatorProxy; +class Protocol; +class NetworkServer; +class Config; +class Log; + +using moodycamel::BlockingConcurrentQueue; +using std::make_shared; + +struct MessageHeader { + MessageHeader(uint8_t msg_type, uint64_t sequence_id) { + msg_type_ = msg_type; + sequence_id_ = sequence_id; + } + uint8_t msg_type_; + uint64_t sequence_id_; + int msg_size; +}; + +class RecvCallback : public Callback { + public: + RecvCallback() = delete; + RecvCallback(Protocol *protocol, ChunkMgr *chunkMgr); + ~RecvCallback() override = default; + void operator()(void 
*buffer_id, void *buffer_size) override; + + private: + Protocol *protocol_; + ChunkMgr *chunkMgr_; +}; + +class SendCallback : public Callback { + public: + SendCallback() = delete; + explicit SendCallback(ChunkMgr *chunkMgr); + ~SendCallback() override = default; + void operator()(void *buffer_id, void *buffer_size) override; + + private: + ChunkMgr *chunkMgr_; +}; + +class ReadCallback : public Callback { + public: + ReadCallback() = delete; + explicit ReadCallback(Protocol *protocol); + ~ReadCallback() override = default; + void operator()(void *buffer_id, void *buffer_size) override; + + private: + Protocol *protocol_; +}; + +class WriteCallback : public Callback { + public: + WriteCallback() = delete; + explicit WriteCallback(Protocol *protocol); + ~WriteCallback() override = default; + void operator()(void *buffer_id, void *buffer_size) override; + + private: + Protocol *protocol_; +}; + +class RecvWorker : public ThreadWrapper { + public: + RecvWorker() = delete; + RecvWorker(Protocol *protocol, int index); + ~RecvWorker() override = default; + int entry() override; + void abort() override; + void addTask(Request *request); + + private: + Protocol *protocol_; + int index_; + bool init; + BlockingConcurrentQueue pendingRecvRequestQueue_; +}; + +class ReadWorker : public ThreadWrapper { + public: + ReadWorker() = delete; + ReadWorker(Protocol *protocol, int index); + ~ReadWorker() override = default; + int entry() override; + void abort() override; + void addTask(RequestReply *requestReply); + + private: + Protocol *protocol_; + int index_; + bool init; + BlockingConcurrentQueue pendingReadRequestQueue_; +}; + +class FinalizeWorker : public ThreadWrapper { + public: + FinalizeWorker() = delete; + explicit FinalizeWorker(Protocol *protocol); + ~FinalizeWorker() override = default; + int entry() override; + void abort() override; + void addTask(RequestReply *requestReply); + + private: + Protocol *protocol_; + BlockingConcurrentQueue pendingRequestReplyQueue_; +}; + +/** + * @brief Protocol connect NetworkServer and AllocatorProtocol to achieve + * network and storage co-design. Protocol maitains three queues: recv queue, + * finalize queue and rma queue. One thread per queue to handle specific event. + * recv queue-> to handle receive event. + * finalize queue-> to handle finalization event. + * rma queue-> to handle remote memory access event. 
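 *
 * Every worker runs the same loop (see RecvWorker::entry() in Protocol.cc);
 * stripped down, the pattern is the following sketch, where protocol stands
 * for the owning Protocol instance:
 *
 *   moodycamel::BlockingConcurrentQueue<Request*> queue;
 *
 *   // producer side, called from the HPNL receive callback
 *   queue.enqueue(request);
 *
 *   // consumer side, executed repeatedly by the worker thread
 *   Request* request;
 *   if (queue.wait_dequeue_timed(request, std::chrono::milliseconds(1000))) {
 *     protocol->handle_recv_msg(request);
 *   }
 *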
+ */ +class Protocol { + public: + Protocol() = delete; + Protocol(Config *config, Log *log, NetworkServer *server, + AllocatorProxy *allocatorProxy); + ~Protocol(); + int init(); + + friend class RecvCallback; + friend class RecvWorker; + + void enqueue_recv_msg(Request *request); + void handle_recv_msg(Request *request); + + void enqueue_finalize_msg(RequestReply *requestReply); + void handle_finalize_msg(RequestReply *requestReply); + + void enqueue_rma_msg(uint64_t buffer_id); + void handle_rma_msg(RequestReply *requestReply); + + public: + Config *config_; + Log *log_; + + private: + NetworkServer *networkServer_; + AllocatorProxy *allocatorProxy_; + + std::shared_ptr recvCallback_; + std::shared_ptr sendCallback_; + std::shared_ptr readCallback_; + std::shared_ptr writeCallback_; + + BlockingConcurrentQueue recvMsgQueue_; + BlockingConcurrentQueue readMsgQueue_; + + std::vector> recvWorkers_; + std::shared_ptr finalizeWorker_; + std::vector> readWorkers_; + + std::mutex rrcMtx_; + std::unordered_map rrcMap_; + uint64_t time; +}; + +#endif // PMPOOL_PROTOCOL_H_ diff --git a/rpmp/pmpool/RmaBufferRegister.h b/rpmp/pmpool/RmaBufferRegister.h new file mode 100644 index 00000000..d354f962 --- /dev/null +++ b/rpmp/pmpool/RmaBufferRegister.h @@ -0,0 +1,22 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/RmaBuffer.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Tuesday, December 24th 2019, 2:37:40 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_RMABUFFERREGISTER_H_ +#define PMPOOL_RMABUFFERREGISTER_H_ + +#include +#include + +class RmaBufferRegister { + public: + virtual Chunk* register_rma_buffer(char* rma_buffer, uint64_t size) = 0; + virtual void unregister_rma_buffer(int buffer_id) = 0; +}; + +#endif // PMPOOL_RMABUFFERREGISTER_H_ diff --git a/rpmp/pmpool/ThreadWrapper.h b/rpmp/pmpool/ThreadWrapper.h new file mode 100644 index 00000000..769cabea --- /dev/null +++ b/rpmp/pmpool/ThreadWrapper.h @@ -0,0 +1,88 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/ThreadWrapper.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool + * Created Date: Thursday, November 7th 2019, 3:48:52 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_THREADWRAPPER_H_ +#define PMPOOL_THREADWRAPPER_H_ + +#include + +#include +#include // NOLINT +#include +#include // NOLINT +#include // NOLINT + +class ThreadWrapper { + public: + ThreadWrapper() : done(false) {} + virtual ~ThreadWrapper() = default; + void join() { + if (thread.joinable()) { + thread.join(); + } else { + std::unique_lock l(join_mutex); + join_event.wait(l, [=] { return done.load(); }); + } + } + void start(bool background_thread = false) { + thread = std::thread(&ThreadWrapper::thread_body, this); + if (background_thread) { + thread.detach(); + } + } + void stop() { done.store(true); } + void set_affinity(int cpu) { +#ifdef __linux__ + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + int res = pthread_setaffinity_np(thread.native_handle(), sizeof(cpu_set_t), + &cpuset); + if (res) { + abort(); + } +#endif + } + void thread_body() { + try { + while (true) { + int ret = entry(); + if (done.load() || ret == -1) { + if (!thread.joinable()) { + join_event.notify_all(); + } + break; + } + } + } catch (ThreadAbortException &) { + abort(); + } catch (std::exception &ex) { + ExceptionCaught(ex); + } catch (...) 
{ + UnknownExceptionCaught(); + } + } + + private: + class ThreadAbortException : std::exception {}; + + protected: + virtual int entry() = 0; + virtual void abort() = 0; + virtual void ExceptionCaught(const std::exception &exception) {} + virtual void UnknownExceptionCaught() {} + + private: + std::thread thread; + std::mutex join_mutex; + std::condition_variable join_event; + std::atomic_bool done = {false}; +}; + +#endif // PMPOOL_THREADWRAPPER_H_ diff --git a/rpmp/pmpool/buffer/CircularBuffer.h b/rpmp/pmpool/buffer/CircularBuffer.h new file mode 100644 index 00000000..a41ce3ab --- /dev/null +++ b/rpmp/pmpool/buffer/CircularBuffer.h @@ -0,0 +1,217 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/buffer/CircularBuffer.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool/buffer + * Created Date: Monday, December 23rd 2019, 2:31:42 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_BUFFER_CIRCULARBUFFER_H_ +#define PMPOOL_BUFFER_CIRCULARBUFFER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include // NOLINT +#include +#include // NOLINT +#include + +#include "../Common.h" +#include "../NetworkServer.h" +#include "../RmaBufferRegister.h" + +#define p2align(x, a) (((x) + (a)-1) & ~((a)-1)) + +class CircularBuffer { + public: + CircularBuffer() = delete; + CircularBuffer(const CircularBuffer &) = delete; + CircularBuffer(uint64_t buffer_size, uint32_t buffer_num, + bool is_server = false, RmaBufferRegister *rbr = nullptr) + : buffer_size_(buffer_size), + buffer_num_(buffer_num), + rbr_(rbr), + read_(0), + write_(0) { + uint64_t total = buffer_num_ * buffer_size_; + buffer_ = static_cast(mmap(0, buffer_num_ * buffer_size_, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + + // for the consideration of high performance, + // we'd better do memory paging before starting service. 
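    // (Pre-touching every byte of the anonymous mmap region forces the kernel
    // to back each page up front, so first-access page faults are paid once at
    // startup instead of inside the latency-critical get()/put() path.)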
+ // if (is_server) { + // for (uint64_t i = 0; i < total; i++) { + // buffer_[i] = 0; + // } + // } + + if (rbr_) { + ck_ = rbr_->register_rma_buffer(buffer_, buffer_num_ * buffer_size_); + } + + for (int i = 0; i < buffer_num; i++) { + bits.push_back(0); + } + } + ~CircularBuffer() { + munmap(buffer_, buffer_num_ * buffer_size_); + buffer_ = nullptr; + } + char *get(uint64_t bytes) { + uint64_t offset = 0; + bool res = get(bytes, &offset); + if (res == false) { + return nullptr; + } + return buffer_ + offset * buffer_size_; + } + void put(const char *data, uint64_t bytes) { + assert((data - buffer_) % buffer_size_ == 0); + uint64_t offset = (data - buffer_) / buffer_size_; + put(offset, bytes); + } + + void dump() { + std::cout << "********************************************" << std::endl; + std::cout << "read_ " << read_ << " write_ " << write_ << std::endl; + for (int i = 0; i < buffer_num_; i++) { + std::cout << bits[i] << " "; + } + std::cout << std::endl; + std::cout << "********************************************" << std::endl; + } + uint64_t get_read_() { return read_; } + uint64_t get_write_() { return write_; } + + bool get(uint64_t bytes, uint64_t *offset) { + uint32_t alloc_num = p2align(bytes, buffer_size_) / buffer_size_; + if (alloc_num > buffer_num_) { + return false; + } + std::lock_guard write_lk(write_mtx); + std::unique_lock read_lk(read_mtx); + uint64_t available = 0; + uint64_t end = 0; + uint64_t index = 0; + read_lt_write: + if (write_ >= read_) { // --------read_--------write_-------- + available = buffer_num_ - write_; + if (available >= alloc_num) { + index = write_; + end = write_ + alloc_num; + while (index < end) { + bits[index++] = 1; + } + *offset = write_; + write_ += alloc_num; + if (write_ == buffer_num_) { + write_ = 0; + } + goto success; + } else { + uint64_t index = write_; + while (index < buffer_num_) { + bits[index++] = 0; + } + write_ = 0; + goto write_lt_read; + } + } + write_lt_read: + // --------write_--------read_----------- + available = read_ - write_; + if (available >= alloc_num) { + index = write_; + end = write_ + alloc_num; + while (index < end) { + bits[index++] = 1; + } + *offset = write_; + write_ += alloc_num; + if (write_ == buffer_num_) { + write_ = 0; + } + goto success; + } else { + // wait + while ((available = read_ - write_) < alloc_num) { + read_cv.wait(read_lk); + if (read_ == 0) { + goto read_lt_write; + } + } + index = write_; + end = write_ + alloc_num; + while (index < end) { + bits[index++] = 1; + } + *offset = write_; + write_ += alloc_num; + if (write_ == buffer_num_) { + write_ = 0; + } + goto success; + } + success: + return true; + } + void put(uint64_t offset, uint64_t bytes) { + uint32_t alloc_num = p2align(bytes, buffer_size_) / buffer_size_; + assert(alloc_num <= buffer_num_ - read_); + std::unique_lock read_lk(read_mtx); + uint64_t index = offset; + uint64_t end = index + alloc_num; + while (index < end) { + bits[index] = 0; + if (read_ == index) { + read_++; + if (read_ == buffer_num_) { + read_ = 0; + } + } + index++; + read_cv.notify_all(); + } + index = read_; + while (bits[index] == 0) { + read_++; + index++; + if (read_ == buffer_num_) { + read_ = 0; + read_cv.notify_all(); + break; + } else { + read_cv.notify_all(); + } + } + } + Chunk *get_rma_chunk() { return ck_; } + uint64_t get_offset(uint64_t data) { return (data - (uint64_t)buffer_); } + + private: + char *buffer_; + char *tmp_; + uint64_t buffer_size_; + uint64_t buffer_num_; + RmaBufferRegister *rbr_; + Chunk *ck_; + std::vector bits; + 
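  // bits[i] == 1 marks buffer slot i as handed out by get(); put() clears it.
  // read_ and write_ are slot indices into the ring: get() advances write_,
  // while put() advances read_ past freed slots and wakes waiters via read_cv.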
uint64_t read_; + uint64_t write_; + std::mutex read_mtx; + std::condition_variable read_cv; + spin_mutex write_mtx; + char tmp[4096]; +}; + +#endif // PMPOOL_BUFFER_CIRCULARBUFFER_H_ diff --git a/rpmp/pmpool/client/NetworkClient.cc b/rpmp/pmpool/client/NetworkClient.cc new file mode 100644 index 00000000..1e32cba9 --- /dev/null +++ b/rpmp/pmpool/client/NetworkClient.cc @@ -0,0 +1,262 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/client/NetworkClient.cc + * Path: /mnt/spark-pmof/tool/rpmp/pmpool/client + * Created Date: Monday, December 16th 2019, 1:16:16 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include "pmpool/client/NetworkClient.h" + +#include +#include +#include + +#include "../Event.h" +#include "../buffer/CircularBuffer.h" + +uint64_t timestamp_now() { + return std::chrono::high_resolution_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); +} + +RequestHandler::RequestHandler(NetworkClient *networkClient) + : networkClient_(networkClient) {} + +void RequestHandler::addTask(Request *request) { handleRequest(request); } + +void RequestHandler::addTask(Request *request, std::function func) { + callback_map[request->get_rc().rid] = func; + handleRequest(request); +} + +void RequestHandler::wait() { + unique_lock lk(h_mtx); + while (!op_finished) { + cv.wait(lk); + } +} + +void RequestHandler::notify(RequestReply *requestReply) { + unique_lock lk(h_mtx); + requestReplyContext = requestReply->get_rrc(); + op_finished = true; + if (callback_map.count(requestReplyContext.rid) != 0) { + callback_map[requestReplyContext.rid](); + callback_map.erase(requestReplyContext.rid); + } else { + cv.notify_one(); + lk.unlock(); + } +} + +void RequestHandler::handleRequest(Request *request) { + op_finished = false; + OpType rt = request->get_rc().type; + switch (rt) { + case ALLOC: { + request->encode(); + networkClient_->send(reinterpret_cast(request->data_), + request->size_); + break; + } + case FREE: { + request->encode(); + networkClient_->send(reinterpret_cast(request->data_), + request->size_); + break; + } + case WRITE: { + request->encode(); + networkClient_->send(reinterpret_cast(request->data_), + request->size_); + break; + } + case READ: { + request->encode(); + networkClient_->send(reinterpret_cast(request->data_), + request->size_); + break; + } + case PUT: { + request->encode(); + networkClient_->send(reinterpret_cast(request->data_), + request->size_); + } + case GET_META: { + request->encode(); + networkClient_->send(reinterpret_cast(request->data_), + request->size_); + } + default: {} + } +} + +RequestReplyContext &RequestHandler::get() { return requestReplyContext; } + +ClientConnectedCallback::ClientConnectedCallback(NetworkClient *networkClient) { + networkClient_ = networkClient; +} + +void ClientConnectedCallback::operator()(void *param_1, void *param_2) { + auto con = static_cast(param_1); + networkClient_->connected(con); +} + +ClientRecvCallback::ClientRecvCallback(ChunkMgr *chunkMgr, + RequestHandler *requestHandler) + : chunkMgr_(chunkMgr), requestHandler_(requestHandler) {} + +void ClientRecvCallback::operator()(void *param_1, void *param_2) { + int mid = *static_cast(param_1); + auto ck = chunkMgr_->get(mid); + + // test start + // auto con = reinterpret_cast(ck->con); + // if (count_ == 0) { + // start = timestamp_now(); + // } + // count_++; + // if (count_ >= 1000000) { + // end = timestamp_now(); + // std::cout << "consumes " << (end-start)/1000.0 << std::endl; + // return; + // } + // RequestContext rc = {}; + // rc.type = 
READ; + // rc.rid = 0; + // rc.size = 0; + // rc.address = 0; + // Request request(rc); + // request.encode(); + // auto new_ck = chunkMgr_->get(con); + // memcpy(new_ck->buffer, request.data_, request.size_); + // new_ck->size = request.size_; + // con->send(new_ck); + // test end + + RequestReply requestReply(reinterpret_cast(ck->buffer), ck->size, + reinterpret_cast(ck->con)); + requestReply.decode(); + RequestReplyContext rrc = requestReply.get_rrc(); + switch (rrc.type) { + case ALLOC_REPLY: { + requestHandler_->notify(&requestReply); + break; + } + case FREE_REPLY: { + requestHandler_->notify(&requestReply); + break; + } + case WRITE_REPLY: { + requestHandler_->notify(&requestReply); + break; + } + case READ_REPLY: { + requestHandler_->notify(&requestReply); + break; + } + default: {} + } + chunkMgr_->reclaim(ck, static_cast(ck->con)); +} + +NetworkClient::NetworkClient(const string &remote_address, + const string &remote_port) + : NetworkClient(remote_address, remote_port, 1, 32, 65536, 64) {} + +NetworkClient::NetworkClient(const string &remote_address, + const string &remote_port, int worker_num, + int buffer_num_per_con, int buffer_size, + int init_buffer_num) + : remote_address_(remote_address), + remote_port_(remote_port), + worker_num_(worker_num), + buffer_num_per_con_(buffer_num_per_con), + buffer_size_(buffer_size), + init_buffer_num_(init_buffer_num), + connected_(false) {} + +NetworkClient::~NetworkClient() { + delete shutdownCallback; + delete connectedCallback; + delete sendCallback; + delete recvCallback; +} + +int NetworkClient::init(RequestHandler *requestHandler) { + client_ = new Client(worker_num_, buffer_num_per_con_); + if ((client_->init()) != 0) { + return -1; + } + chunkMgr_ = new ChunkPool(client_, buffer_size_, init_buffer_num_); + + client_->set_chunk_mgr(chunkMgr_); + + shutdownCallback = new ClientShutdownCallback(); + connectedCallback = new ClientConnectedCallback(this); + recvCallback = new ClientRecvCallback(chunkMgr_, requestHandler); + sendCallback = new ClientSendCallback(chunkMgr_); + + client_->set_shutdown_callback(shutdownCallback); + client_->set_connected_callback(connectedCallback); + client_->set_recv_callback(recvCallback); + client_->set_send_callback(sendCallback); + + client_->start(); + int res = client_->connect(remote_address_.c_str(), remote_port_.c_str()); + unique_lock lk(con_mtx); + while (!connected_) { + con_v.wait(lk); + } + + circularBuffer_ = make_shared(1024 * 1024, 512, false, this); +} + +void NetworkClient::shutdown() { client_->shutdown(); } + +void NetworkClient::wait() { client_->wait(); } + +Chunk *NetworkClient::register_rma_buffer(char *rma_buffer, uint64_t size) { + return client_->reg_rma_buffer(rma_buffer, size, buffer_id_++); +} + +void NetworkClient::unregister_rma_buffer(int buffer_id) { + client_->unreg_rma_buffer(buffer_id); +} + +uint64_t NetworkClient::get_dram_buffer(const char *data, uint64_t size) { + char *dest = circularBuffer_->get(size); + if (data) { + memcpy(dest, data, size); + } + return (uint64_t)dest; +} + +void NetworkClient::reclaim_dram_buffer(uint64_t src_address, uint64_t size) { + circularBuffer_->put(reinterpret_cast(src_address), size); +} + +uint64_t NetworkClient::get_rkey() { + return circularBuffer_->get_rma_chunk()->mr->key; +} + +void NetworkClient::connected(Connection *con) { + std::unique_lock lk(con_mtx); + con_ = con; + connected_ = true; + con_v.notify_all(); + lk.unlock(); +} + +void NetworkClient::send(char *data, uint64_t size) { + auto ck = chunkMgr_->get(con_); + 
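  // The chunk is drawn from the ChunkPool created in init(); the payload is
  // copied into its buffer before posting, and ClientSendCallback reclaims the
  // chunk once the send completes.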
std::memcpy(reinterpret_cast(ck->buffer), data, size); + ck->size = size; + con_->send(ck); +} + +void NetworkClient::read(Request *request) { + RequestContext rc = request->get_rc(); +} diff --git a/rpmp/pmpool/client/NetworkClient.h b/rpmp/pmpool/client/NetworkClient.h new file mode 100644 index 00000000..f9e4f54c --- /dev/null +++ b/rpmp/pmpool/client/NetworkClient.h @@ -0,0 +1,167 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/client/NetworkClient.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool/client + * Created Date: Wednesday, December 11th 2019, 2:02:46 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_CLIENT_NETWORKCLIENT_H_ +#define PMPOOL_CLIENT_NETWORKCLIENT_H_ + +#include +#include + +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include +#include +#include + +#include "../Event.h" +#include "../RmaBufferRegister.h" +#include "../ThreadWrapper.h" +#include "../queue/blockingconcurrentqueue.h" +#include "../queue/concurrentqueue.h" + +using moodycamel::BlockingConcurrentQueue; +using std::atomic; +using std::condition_variable; +using std::future; +using std::make_shared; +using std::mutex; +using std::promise; +using std::shared_ptr; +using std::string; +using std::unique_lock; +using std::unordered_map; + +class NetworkClient; +class CircularBuffer; +class Connection; +class ChunkMgr; + +typedef promise Promise; +typedef future Future; + +class RequestHandler { + public: + explicit RequestHandler(NetworkClient *networkClient); + ~RequestHandler() = default; + void addTask(Request *request); + void addTask(Request *request, std::function func); + void notify(RequestReply *requestReply); + void wait(); + RequestReplyContext &get(); + + private: + void handleRequest(Request *request); + + private: + NetworkClient *networkClient_; + BlockingConcurrentQueue pendingRequestQueue_; + std::mutex h_mtx; + unordered_map> callback_map; + uint64_t total_num = 0; + uint64_t begin = 0; + uint64_t end = 0; + uint64_t time = 0; + bool op_finished = false; + std::condition_variable cv; + RequestReplyContext requestReplyContext; +}; + +class ClientShutdownCallback : public Callback { + public: + ClientShutdownCallback() {} + ~ClientShutdownCallback() = default; + void operator()(void *param_1, void *param_2) {} +}; + +class ClientConnectedCallback : public Callback { + public: + explicit ClientConnectedCallback(NetworkClient *networkClient); + ~ClientConnectedCallback() = default; + void operator()(void *param_1, void *param_2); + + private: + NetworkClient *networkClient_; +}; + +class ClientRecvCallback : public Callback { + public: + ClientRecvCallback(ChunkMgr *chunkMgr, RequestHandler *requestHandler); + ~ClientRecvCallback() = default; + void operator()(void *param_1, void *param_2); + + private: + ChunkMgr *chunkMgr_; + RequestHandler *requestHandler_; + uint64_t count_ = 0; + uint64_t time = 0; + uint64_t start = 0; + uint64_t end = 0; + std::mutex mtx; +}; + +class ClientSendCallback : public Callback { + public: + explicit ClientSendCallback(ChunkMgr *chunkMgr) : chunkMgr_(chunkMgr) {} + ~ClientSendCallback() = default; + void operator()(void *param_1, void *param_2) { + auto buffer_id_ = *static_cast(param_1); + auto ck = chunkMgr_->get(buffer_id_); + chunkMgr_->reclaim(ck, static_cast(ck->con)); + } + + private: + ChunkMgr *chunkMgr_; +}; + +class NetworkClient : public RmaBufferRegister { + public: + friend ClientConnectedCallback; + NetworkClient() = delete; + NetworkClient(const string &remote_address, const string 
&remote_port); + NetworkClient(const string &remote_address, const string &remote_port, + int worker_num, int buffer_num_per_con, int buffer_size, + int init_buffer_num); + ~NetworkClient(); + int init(RequestHandler *requesthandler); + void shutdown(); + void wait(); + Chunk *register_rma_buffer(char *rma_buffer, uint64_t size) override; + void unregister_rma_buffer(int buffer_id) override; + uint64_t get_dram_buffer(const char *data, uint64_t size); + void reclaim_dram_buffer(uint64_t src_address, uint64_t size); + uint64_t get_rkey(); + void connected(Connection *con); + void send(char *data, uint64_t size); + void read(Request *request); + + private: + string remote_address_; + string remote_port_; + int worker_num_; + int buffer_num_per_con_; + int buffer_size_; + int init_buffer_num_; + Client *client_; + ChunkMgr *chunkMgr_; + Connection *con_; + ClientShutdownCallback *shutdownCallback; + ClientConnectedCallback *connectedCallback; + ClientRecvCallback *recvCallback; + ClientSendCallback *sendCallback; + mutex con_mtx; + bool connected_; + condition_variable con_v; + shared_ptr circularBuffer_; + atomic buffer_id_{0}; +}; + +#endif // PMPOOL_CLIENT_NETWORKCLIENT_H_ diff --git a/rpmp/pmpool/client/PmPoolClient.cc b/rpmp/pmpool/client/PmPoolClient.cc new file mode 100644 index 00000000..8bc72e1c --- /dev/null +++ b/rpmp/pmpool/client/PmPoolClient.cc @@ -0,0 +1,192 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/client/PmPoolClient.cc + * Path: /mnt/spark-pmof/tool/rpmp/pmpool/client + * Created Date: Friday, December 13th 2019, 3:44:08 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#include "pmpool/client/PmPoolClient.h" + +#include "pmpool/Digest.h" +#include "pmpool/Event.h" +#include "pmpool/Protocol.h" +#include "NetworkClient.h" + +PmPoolClient::PmPoolClient(const string &remote_address, + const string &remote_port) { + tx_finished = true; + op_finished = false; + networkClient_ = make_shared(remote_address, remote_port); + requestHandler_ = make_shared(networkClient_.get()); +} + +PmPoolClient::~PmPoolClient() {} + +int PmPoolClient::init() { networkClient_->init(requestHandler_.get()); } + +void PmPoolClient::begin_tx() { + std::unique_lock lk(tx_mtx); + while (!tx_finished) { + tx_con.wait(lk); + } + tx_finished; +} + +uint64_t PmPoolClient::alloc(uint64_t size) { + RequestContext rc = {}; + rc.type = ALLOC; + rc.rid = rid_++; + rc.size = size; + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + return requestHandler_->get().address; +} + +int PmPoolClient::free(uint64_t address) { + RequestContext rc = {}; + rc.type = FREE; + rc.rid = rid_++; + rc.address = address; + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + return requestHandler_->get().success; +} + +void PmPoolClient::shutdown() { networkClient_->shutdown(); } + +void PmPoolClient::wait() { networkClient_->wait(); } + +int PmPoolClient::write(uint64_t address, const char *data, uint64_t size) { + RequestContext rc = {}; + rc.type = WRITE; + rc.rid = rid_++; + rc.size = size; + rc.address = address; + // allocate memory for RMA read from client. 
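  // The payload is staged in the client-side circular buffer that
  // NetworkClient registered for RMA, so the request only carries the staging
  // address (src_address) and its remote key (src_rkey) and the server can
  // read the data remotely. The staging region is reclaimed below once the
  // reply arrives.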
+ rc.src_address = networkClient_->get_dram_buffer(data, rc.size); + rc.src_rkey = networkClient_->get_rkey(); + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + auto res = requestHandler_->get().success; + networkClient_->reclaim_dram_buffer(rc.src_address, rc.size); + return res; +} + +uint64_t PmPoolClient::write(const char *data, uint64_t size) { + RequestContext rc = {}; + rc.type = WRITE; + rc.rid = rid_++; + rc.size = size; + rc.address = 0; + // allocate memory for RMA read from client. + rc.src_address = networkClient_->get_dram_buffer(data, rc.size); + rc.src_rkey = networkClient_->get_rkey(); + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + auto res = requestHandler_->get().address; + networkClient_->reclaim_dram_buffer(rc.src_address, rc.size); + return res; +} + +int PmPoolClient::read(uint64_t address, char *data, uint64_t size) { + RequestContext rc = {}; + rc.type = READ; + rc.rid = rid_++; + rc.size = size; + rc.address = address; + // allocate memory for RMA read from client. + rc.src_address = networkClient_->get_dram_buffer(nullptr, rc.size); + rc.src_rkey = networkClient_->get_rkey(); + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + auto res = requestHandler_->get().success; + if (!res) { + memcpy(data, reinterpret_cast(rc.src_address), size); + } + networkClient_->reclaim_dram_buffer(rc.src_address, rc.size); + return res; +} + +int PmPoolClient::read(uint64_t address, char *data, uint64_t size, + std::function func) { + RequestContext rc = {}; + rc.type = READ; + rc.rid = rid_++; + rc.size = size; + rc.address = address; + // allocate memory for RMA read from client. + rc.src_address = networkClient_->get_dram_buffer(nullptr, rc.size); + rc.src_rkey = networkClient_->get_rkey(); + Request request(rc); + requestHandler_->addTask(&request, [&] { + auto res = requestHandler_->get().success; + if (res) { + memcpy(data, reinterpret_cast(rc.src_address), size); + } + networkClient_->reclaim_dram_buffer(rc.src_address, rc.size); + func(res); + }); + return 0; +} + +void PmPoolClient::end_tx() { + std::lock_guard lk(tx_mtx); + tx_finished = true; + tx_con.notify_one(); +} + +int PmPoolClient::put(const string &key, const char *value, uint64_t size) { + uint64_t key_uint; + Digest::computeKeyHash(key, &key_uint); + RequestContext rc = {}; + rc.type = PUT; + rc.rid = rid_++; + rc.size = size; + rc.address = 0; + // allocate memory for RMA read from client. 
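  // Same staging path as write(): the value is copied into the registered
  // circular buffer, and rc.key carries the 64-bit hash of the user key
  // computed by Digest::computeKeyHash above.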
+ rc.src_address = networkClient_->get_dram_buffer(value, rc.size); + rc.src_rkey = networkClient_->get_rkey(); + rc.key = key_uint; + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + auto res = requestHandler_->get().address; + networkClient_->reclaim_dram_buffer(rc.src_address, rc.size); + return res; +} + +vector PmPoolClient::get(const string &key) { + uint64_t key_uint; + Digest::computeKeyHash(key, &key_uint); + RequestContext rc = {}; + rc.type = GET_META; + rc.rid = rid_++; + rc.address = 0; + rc.key = key_uint; + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + auto bml = requestHandler_->get().bml; + return bml; +} + +int PmPoolClient::del(const string &key) { + uint64_t key_uint; + Digest::computeKeyHash(key, &key_uint); + RequestContext rc = {}; + rc.type = DELETE; + rc.rid = rid_++; + rc.key = key_uint; + Request request(rc); + requestHandler_->addTask(&request); + requestHandler_->wait(); + auto res = requestHandler_->get().success; + return res; +} diff --git a/rpmp/pmpool/client/PmPoolClient.h b/rpmp/pmpool/client/PmPoolClient.h new file mode 100644 index 00000000..3cd2964d --- /dev/null +++ b/rpmp/pmpool/client/PmPoolClient.h @@ -0,0 +1,100 @@ +/* + * Filename: /mnt/spark-pmof/tool/rpmp/pmpool/client/PmPoolClient.h + * Path: /mnt/spark-pmof/tool/rpmp/pmpool/client + * Created Date: Friday, December 13th 2019, 3:43:04 pm + * Author: root + * + * Copyright (c) 2019 Intel + */ + +#ifndef PMPOOL_CLIENT_PMPOOLCLIENT_H_ +#define PMPOOL_CLIENT_PMPOOLCLIENT_H_ + +#define INITIAL_BUFFER_NUMBER 64 + +#include +#include +#include +#include + +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include +#include +#include + +#include "../Common.h" +#include "../Base.h" +#include "../ThreadWrapper.h" + +class NetworkClient; +class RequestHandler; +class Function; + +using std::atomic; +using std::make_shared; +using std::shared_ptr; +using std::string; +using std::vector; + +class PmPoolClient { + public: + PmPoolClient() = delete; + PmPoolClient(const string &remote_address, const string &remote_port); + ~PmPoolClient(); + int init(); + + /// memory pool interface + void begin_tx(); + /// Allocate the given size of memory from remote memory pool. + /// Return the global address of memory pool. + uint64_t alloc(uint64_t size); + + /// Free memory with the global address. + /// Address is the global address that returned by alloc. + /// Return 0 if succeed, return others value if fail. + int free(uint64_t address); + + /// Write data to the address of remote memory pool. + /// The size is number of bytes + /// Return 0 if succeed, return others value if fail. + int write(uint64_t address, const char *data, uint64_t size); + + /// Return global address if succeed, return -1 if fail. + uint64_t write(const char *data, uint64_t size); + + /// Read from the global address of remote memory pool and copy to data + /// pointer. + /// Return 0 if succeed, return others value if fail. 
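  ///
  /// Illustrative round trip (a sketch only; the endpoint, buffer size, and
  /// transaction bracketing here are example choices, and error handling is
  /// omitted):
  ///
  ///   PmPoolClient client("172.168.0.40", "12346");
  ///   client.init();
  ///   client.begin_tx();
  ///   uint64_t addr = client.alloc(4096);
  ///   char buf[4096] = {0};
  ///   client.write(addr, buf, sizeof(buf));
  ///   client.read(addr, buf, sizeof(buf));
  ///   client.free(addr);
  ///   client.end_tx();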
+ int read(uint64_t address, char *data, uint64_t size); + + int read(uint64_t address, char *data, uint64_t size, + std::function func); + void end_tx(); + + /// key-value storage interface + int put(const string &key, const char *value, uint64_t size); + vector get(const string &key); + int del(const string &key); + + void shutdown(); + void wait(); + + private: + shared_ptr requestHandler_; + shared_ptr networkClient_; + atomic rid_ = {0}; + std::mutex tx_mtx; + std::condition_variable tx_con; + bool tx_finished; + std::mutex op_mtx; + bool op_finished; +}; + +#endif // PMPOOL_CLIENT_PMPOOLCLIENT_H_ diff --git a/rpmp/pmpool/client/java/rpmp/.classpath b/rpmp/pmpool/client/java/rpmp/.classpath new file mode 100644 index 00000000..71f5fefe --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/.classpath @@ -0,0 +1,44 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/rpmp/pmpool/client/java/rpmp/.project b/rpmp/pmpool/client/java/rpmp/.project new file mode 100644 index 00000000..066a77e0 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/.project @@ -0,0 +1,23 @@ + + + rpmp + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.core.resources.prefs b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 00000000..f9fe3459 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,4 @@ +eclipse.preferences.version=1 +encoding//src/main/java=UTF-8 +encoding//src/test/java=UTF-8 +encoding/=UTF-8 diff --git a/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.jdt.apt.core.prefs b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.jdt.apt.core.prefs new file mode 100644 index 00000000..d4313d4b --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.jdt.apt.core.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.apt.aptEnabled=false diff --git a/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.jdt.core.prefs b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 00000000..b11489fa --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,9 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 +org.eclipse.jdt.core.compiler.compliance=1.7 +org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore +org.eclipse.jdt.core.compiler.processAnnotations=disabled +org.eclipse.jdt.core.compiler.release=disabled +org.eclipse.jdt.core.compiler.source=1.7 diff --git a/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.m2e.core.prefs b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.m2e.core.prefs new file mode 100644 index 00000000..f897a7f1 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/.settings/org.eclipse.m2e.core.prefs @@ -0,0 +1,4 @@ +activeProfiles= +eclipse.preferences.version=1 +resolveWorkspaceProjects=true +version=1 diff --git a/rpmp/pmpool/client/java/rpmp/pom.xml b/rpmp/pmpool/client/java/rpmp/pom.xml new file mode 100644 index 00000000..65310b2a --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/pom.xml @@ -0,0 +1,75 @@ + + + + 4.0.0 + + com.intel.rpmp + rpmp + 0.1 + + rpmp + + 
http://www.example.com + + + UTF-8 + 1.7 + 1.7 + + + + + junit + junit + 4.11 + test + + + + + + + + + maven-clean-plugin + 3.1.0 + + + + maven-resources-plugin + 3.0.2 + + + maven-compiler-plugin + 3.8.0 + + + maven-surefire-plugin + 2.22.1 + + + maven-jar-plugin + 3.0.2 + + + maven-install-plugin + 2.5.2 + + + maven-deploy-plugin + 2.8.2 + + + + maven-site-plugin + 3.7.1 + + + maven-project-info-reports-plugin + 3.0.0 + + + + + diff --git a/rpmp/pmpool/client/java/rpmp/src/main/java/com/intel/rpmp/PmPoolClient.java b/rpmp/pmpool/client/java/rpmp/src/main/java/com/intel/rpmp/PmPoolClient.java new file mode 100644 index 00000000..1d431620 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/src/main/java/com/intel/rpmp/PmPoolClient.java @@ -0,0 +1,115 @@ +package com.intel.rpmp; + +import java.io.IOException; +import java.lang.reflect.Constructor; +import java.nio.ByteBuffer; + +/** + * PmPoolClient + * + */ +public class PmPoolClient { + static { + System.loadLibrary("pmpool"); + } + + public PmPoolClient(String remote_address, String remote_port) { + objectId = newPmPoolClient_(remote_address, remote_port); + } + + public long alloc(long size) { + return alloc_(size, objectId); + } + + public int free(long address) { + return free_(address, objectId); + } + + public int write(long address, String data, long size) { + return write_(address, data, size, objectId); + } + + public long write(String data, long size) { + return alloc_and_write_(data, size, objectId); + } + + public long write(ByteBuffer data, long size) { + return alloc_and_write_(data, size, objectId); + } + + public int read(long address, long size, ByteBuffer byteBuffer) { + return read_(address, size, byteBuffer, objectId); + } + + public int put(String key, ByteBuffer data, long size) { + return put(key, data, size, objectId); + } + + public int del(String key) { + return del(key, objectId); + } + + public void shutdown() { + shutdown_(objectId); + } + + public void waitToStop() { + waitToStop_(objectId); + } + + public void dispose() { + dispose_(objectId); + } + + private ByteBuffer convertToByteBuffer(long address, int length) throws IOException { + Class classDirectByteBuffer; + try { + classDirectByteBuffer = Class.forName("java.nio.DirectByteBuffer"); + } catch (ClassNotFoundException e) { + throw new IOException("java.nio.DirectByteBuffer class not found"); + } + Constructor constructor; + try { + constructor = classDirectByteBuffer.getDeclaredConstructor(long.class, int.class); + } catch (NoSuchMethodException e) { + throw new IOException("java.nio.DirectByteBuffer constructor not found"); + } + constructor.setAccessible(true); + ByteBuffer byteBuffer; + try { + byteBuffer = (ByteBuffer) constructor.newInstance(address, length); + } catch (Exception e) { + throw new IOException("java.nio.DirectByteBuffer exception: " + e.toString()); + } + + return byteBuffer; + } + + private native long newPmPoolClient_(String remote_address, String remote_port); + + private native long alloc_(long size, long objectId); + + private native int free_(long address, long objectId); + + private native int write_(long address, String data, long size, long objectId); + + private native long alloc_and_write_(String data, long size, long objectId); + + private native long alloc_and_write_(ByteBuffer data, long size, long objectId); + + private native int put(String key, ByteBuffer data, long size, long objectId); + + private native long[] getMeta(String key, long objectId); + + private native int del(String key, long objectId); + + private 
native int read_(long address, long size, ByteBuffer byteBuffer, long objectId); + + private native void shutdown_(long objectId); + + private native void waitToStop_(long objectId); + + private native void dispose_(long objectId); + + private long objectId; +} diff --git a/rpmp/pmpool/client/java/rpmp/src/test/java/com/intel/rpmp/PmPoolClientTest.java b/rpmp/pmpool/client/java/rpmp/src/test/java/com/intel/rpmp/PmPoolClientTest.java new file mode 100644 index 00000000..2f4669ad --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/src/test/java/com/intel/rpmp/PmPoolClientTest.java @@ -0,0 +1,94 @@ +package com.intel.rpmp; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import java.nio.ByteBuffer; +import java.util.Random; + +/** + * You need to start rpmp service before running the following tests. + */ +public class PmPoolClientTest +{ + @Before + public void setup() { + pmPoolClient = new PmPoolClient("172.168.0.40", "12346"); + } + + @After + public void tear() { + pmPoolClient.shutdown(); + pmPoolClient.waitToStop(); + pmPoolClient.dispose(); + } + + @Test + public void remoteAlloc() { + for (int i = 0; i < 100; i++) { + long address = pmPoolClient.alloc(4096); + assertTrue(address > 0); + } + } + + @Test + public void remoteWrite() { + Random rand = new Random(); + for (int i = 0; i < 100; i++) { + long address = pmPoolClient.alloc(rand.nextInt((1024*1024*8))); + assertTrue(address > 0); + String data = "hello"; + assertEquals(0, pmPoolClient.write(address, data, data.length())); + } + } + + + @Test + public void remoteRead() + { + long address = pmPoolClient.alloc(4096); + assertTrue(address > 0); + String data = "hello"; + assertEquals(0, pmPoolClient.write(address, data, data.length())); + ByteBuffer byteBuffer = ByteBuffer.allocateDirect(4096); + ByteBuffer testBuffer = ByteBuffer.allocateDirect(4096); + for (int i = 0; i < 5; i++) { + testBuffer.put(data.getBytes()[i]); + } + testBuffer.flip(); + assertEquals(0, pmPoolClient.read(address, 5, byteBuffer)); + for (int i = 0; i < 5; i++) { + assertEquals(true, (char)byteBuffer.get() == (char)testBuffer.get()); + } + } + + @Test + public void remoteAllocAndWrite() { + for (int i = 0; i < 100; i++) { + String data = "hello"; + assertTrue(pmPoolClient.write(data, data.length()) > 0); + } + } + + public void remoteAllocAndWriteThenRead() + { + String data = "hello"; + long address = pmPoolClient.write(data, data.length()); + assertTrue(address > 0); + ByteBuffer byteBuffer = ByteBuffer.allocateDirect(4096); + ByteBuffer testBuffer = ByteBuffer.allocateDirect(4096); + for (int i = 0; i < 5; i++) { + testBuffer.put(data.getBytes()[i]); + } + testBuffer.flip(); + assertEquals(0, pmPoolClient.read(address, 5, byteBuffer)); + for (int i = 0; i < 5; i++) { + assertEquals(true, (char)byteBuffer.get() == (char)testBuffer.get()); + } + } + + private PmPoolClient pmPoolClient; +} diff --git a/rpmp/pmpool/client/java/rpmp/target/classes/com/intel/rpmp/PmPoolClient.class b/rpmp/pmpool/client/java/rpmp/target/classes/com/intel/rpmp/PmPoolClient.class new file mode 100644 index 00000000..2cdedac0 Binary files /dev/null and b/rpmp/pmpool/client/java/rpmp/target/classes/com/intel/rpmp/PmPoolClient.class differ diff --git a/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst 
b/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst new file mode 100644 index 00000000..e69de29b diff --git a/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst b/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst new file mode 100644 index 00000000..ab698a7d --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst @@ -0,0 +1 @@ +/mnt/spark-pmof/tool/rpmp/pmpool/client/java/rpmp/src/main/java/com/intel/rpmp/PmPoolClient.java diff --git a/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst b/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst new file mode 100644 index 00000000..e69de29b diff --git a/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst b/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst new file mode 100644 index 00000000..dfd02fe7 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst @@ -0,0 +1 @@ +/mnt/spark-pmof/tool/rpmp/pmpool/client/java/rpmp/src/test/java/com/intel/rpmp/PmPoolClientTest.java diff --git a/rpmp/pmpool/client/java/rpmp/target/surefire-reports/TEST-com.intel.rpmp.PmPoolClientTest.xml b/rpmp/pmpool/client/java/rpmp/target/surefire-reports/TEST-com.intel.rpmp.PmPoolClientTest.xml new file mode 100644 index 00000000..33c46b59 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/target/surefire-reports/TEST-com.intel.rpmp.PmPoolClientTest.xml @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/rpmp/pmpool/client/java/rpmp/target/surefire-reports/com.intel.rpmp.PmPoolClientTest.txt b/rpmp/pmpool/client/java/rpmp/target/surefire-reports/com.intel.rpmp.PmPoolClientTest.txt new file mode 100644 index 00000000..53952175 --- /dev/null +++ b/rpmp/pmpool/client/java/rpmp/target/surefire-reports/com.intel.rpmp.PmPoolClientTest.txt @@ -0,0 +1,4 @@ +------------------------------------------------------------------------------- +Test set: com.intel.rpmp.PmPoolClientTest +------------------------------------------------------------------------------- +Tests run: 4, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 2.63 s - in com.intel.rpmp.PmPoolClientTest diff --git a/rpmp/pmpool/client/java/rpmp/target/test-classes/com/intel/rpmp/PmPoolClientTest.class b/rpmp/pmpool/client/java/rpmp/target/test-classes/com/intel/rpmp/PmPoolClientTest.class new file mode 100644 index 00000000..d196a320 Binary files /dev/null and b/rpmp/pmpool/client/java/rpmp/target/test-classes/com/intel/rpmp/PmPoolClientTest.class differ diff --git a/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.cc b/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.cc new file mode 100644 index 00000000..75cbca8a --- /dev/null +++ b/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.cc @@ -0,0 +1,168 @@ +/* + * Filename: + * /mnt/spark-pmof/tool/rpmp/pmpool/client/native/PmPoolClientNative.cc Path: + * /mnt/spark-pmof/tool/rpmp/pmpool/client/native Created Date: Monday, 
February + * 24th 2020, 9:23:22 pm Author: root + * + * Copyright (c) 2020 Intel + */ +#include + +#include "pmpool/client/PmPoolClient.h" +#include "pmpool/client/native/com_intel_rpmp_PmPoolClient.h" + +JNIEXPORT jlong JNICALL Java_com_intel_rpmp_PmPoolClient_newPmPoolClient_1( + JNIEnv *env, jobject obj, jstring address, jstring port) { + const char *remote_address = env->GetStringUTFChars(address, 0); + const char *remote_port = env->GetStringUTFChars(port, 0); + + PmPoolClient *client = new PmPoolClient(remote_address, remote_port); + client->begin_tx(); + client->init(); + client->end_tx(); + + env->ReleaseStringUTFChars(address, remote_address); + env->ReleaseStringUTFChars(port, remote_port); + + return reinterpret_cast(client); +} + +JNIEXPORT jlong JNICALL Java_com_intel_rpmp_PmPoolClient_alloc_1( + JNIEnv *env, jobject obj, jlong size, jlong objectId) { + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + uint64_t address = client->alloc(size); + client->end_tx(); + return address; +} + +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_free_1(JNIEnv *env, + jobject obj, + jlong address, + jlong objectId) { + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + int success = client->free(address); + client->end_tx(); + return success; +} + +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_write_1( + JNIEnv *env, jobject obj, jlong address, jstring data, jlong size, + jlong objectId) { + const char *raw_data = env->GetStringUTFChars(data, 0); + + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + int success = client->write(address, raw_data, size); + client->end_tx(); + + env->ReleaseStringUTFChars(data, raw_data); + + return success; +} +JNIEXPORT jlong JNICALL +Java_com_intel_rpmp_PmPoolClient_alloc_1and_1write_1__Ljava_lang_String_2JJ( + JNIEnv *env, jobject obj, jstring data, jlong size, jlong objectId) { + const char *raw_data = env->GetStringUTFChars(data, 0); + + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + uint64_t address = client->write(raw_data, size); + client->end_tx(); + + env->ReleaseStringUTFChars(data, raw_data); + return address; +} + +JNIEXPORT jlong JNICALL +Java_com_intel_rpmp_PmPoolClient_alloc_1and_1write_1__Ljava_nio_ByteBuffer_2JJ( + JNIEnv *env, jobject obj, jobject data, jlong size, jlong objectId) { + char *raw_data = static_cast((*env).GetDirectBufferAddress(data)); + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + uint64_t address = client->write(raw_data, size); + client->end_tx(); +} + +JNIEXPORT jint JNICALL +Java_com_intel_rpmp_PmPoolClient_put(JNIEnv *env, jobject obj, jstring key, + jobject data, jlong size, jlong objectId) { + char *raw_data = static_cast((*env).GetDirectBufferAddress(data)); + const char *raw_key = env->GetStringUTFChars(key, 0); + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + client->put(raw_key, raw_data, size); + client->end_tx(); + env->ReleaseStringUTFChars(key, raw_key); + return 0; +} + +JNIEXPORT jlongArray JNICALL Java_com_intel_rpmp_PmPoolClient_getMeta( + JNIEnv *env, jobject obj, jstring key, jlong objectId) { + const char *raw_key = env->GetStringUTFChars(key, 0); + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + auto bml = client->get(raw_key); + client->end_tx(); + env->ReleaseStringUTFChars(key, raw_key); + int longCArraySize = bml.size() * 2; + jlongArray longJavaArray = env->NewLongArray(longCArraySize); + uint64_t 
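  // The block-meta list is flattened into the jlongArray as consecutive
  // (address, size) pairs, matching the fill loop below.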
*longCArray = + static_cast(std::malloc(longCArraySize * sizeof(uint64_t))); + if (longJavaArray == nullptr) { + return nullptr; + } + int i = 0; + for (auto bm : bml) { + longCArray[i++] = bm.address; + longCArray[i++] = bm.size; + } + env->SetLongArrayRegion(longJavaArray, 0, longCArraySize, + reinterpret_cast(longCArray)); + std::free(longCArray); + env->ReleaseStringUTFChars(key, raw_key); + return longJavaArray; +} + +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_del(JNIEnv *env, + jobject obj, + jstring key, + jlong objectId) { + const char *raw_key = env->GetStringUTFChars(key, 0); + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + int res = client->del(raw_key); + client->end_tx(); + env->ReleaseStringUTFChars(key, raw_key); + return res; +} + +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_read_1( + JNIEnv *env, jobject obj, jlong address, jlong size, jobject data, + jlong objectId) { + char *raw_data = static_cast((*env).GetDirectBufferAddress(data)); + PmPoolClient *client = reinterpret_cast(objectId); + client->begin_tx(); + int success = client->read(address, raw_data, size); + client->end_tx(); + return success; +} + +JNIEXPORT void JNICALL Java_com_intel_rpmp_PmPoolClient_shutdown_1( + JNIEnv *env, jobject obj, jlong objectId) { + PmPoolClient *client = reinterpret_cast(objectId); + client->shutdown(); +} + +JNIEXPORT void JNICALL Java_com_intel_rpmp_PmPoolClient_waitToStop_1( + JNIEnv *env, jobject obj, jlong objectId) { + PmPoolClient *client = reinterpret_cast(objectId); + client->wait(); +} + +JNIEXPORT void JNICALL Java_com_intel_rpmp_PmPoolClient_dispose_1( + JNIEnv *env, jobject obj, jlong objectId) { + PmPoolClient *client = reinterpret_cast(objectId); + delete client; +} diff --git a/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.h b/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.h new file mode 100644 index 00000000..23f80521 --- /dev/null +++ b/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.h @@ -0,0 +1,140 @@ +/* + * Filename: + * /mnt/spark-pmof/Spark-PMoF/rpmp/pmpool/client/native/com_intel_rpmp_PmPoolClient.h + * Path: /mnt/spark-pmof/Spark-PMoF/rpmp/pmpool/client/native + * Created Date: Thursday, March 5th 2020, 10:44:12 am + * Author: root + * + * Copyright (c) 2020 Intel + */ + +#include +/* Header for class com_intel_rpmp_PmPoolClient */ + +#ifndef PMPOOL_CLIENT_NATIVE_COM_INTEL_RPMP_PMPOOLCLIENT_H_ +#define PMPOOL_CLIENT_NATIVE_COM_INTEL_RPMP_PMPOOLCLIENT_H_ +#ifdef __cplusplus +extern "C" { +#endif +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: newPmPoolClient_ + * Signature: (Ljava/lang/String;Ljava/lang/String;)J + */ +JNIEXPORT jlong JNICALL Java_com_intel_rpmp_PmPoolClient_newPmPoolClient_1( + JNIEnv *, jobject, jstring, jstring); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: alloc_ + * Signature: (JJ)J + */ +JNIEXPORT jlong JNICALL Java_com_intel_rpmp_PmPoolClient_alloc_1(JNIEnv *, + jobject, jlong, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: free_ + * Signature: (JJ)I + */ +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_free_1(JNIEnv *, + jobject, jlong, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: write_ + * Signature: (JLjava/lang/String;JJ)I + */ +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_write_1(JNIEnv *, + jobject, jlong, + jstring, jlong, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: alloc_and_write_ + * Signature: (Ljava/lang/String;JJ)J + */ +JNIEXPORT jlong JNICALL 
+Java_com_intel_rpmp_PmPoolClient_alloc_1and_1write_1__Ljava_lang_String_2JJ( + JNIEnv *, jobject, jstring, jlong, jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: alloc_and_write_ + * Signature: (Ljava/nio/ByteBuffer;JJ)J + */ +JNIEXPORT jlong JNICALL +Java_com_intel_rpmp_PmPoolClient_alloc_1and_1write_1__Ljava_nio_ByteBuffer_2JJ( + JNIEnv *, jobject, jobject, jlong, jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: put + * Signature: (Ljava/lang/String;Ljava/nio/ByteBuffer;JJ)I + */ +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_put(JNIEnv *, jobject, + jstring, jobject, + jlong, jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: getMeta + * Signature: (Ljava/lang/String;J)[J + */ +JNIEXPORT jlongArray JNICALL Java_com_intel_rpmp_PmPoolClient_getMeta(JNIEnv *, + jobject, + jstring, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: del + * Signature: (Ljava/lang/String;J)I + */ +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_del(JNIEnv *, jobject, + jstring, jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: read_ + * Signature: (JJLjava/nio/ByteBuffer;J)I + */ +JNIEXPORT jint JNICALL Java_com_intel_rpmp_PmPoolClient_read_1(JNIEnv *, + jobject, jlong, + jlong, jobject, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: shutdown_ + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_com_intel_rpmp_PmPoolClient_shutdown_1(JNIEnv *, + jobject, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: waitToStop_ + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_com_intel_rpmp_PmPoolClient_waitToStop_1(JNIEnv *, + jobject, + jlong); + +/* + * Class: com_intel_rpmp_PmPoolClient + * Method: dispose_ + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_com_intel_rpmp_PmPoolClient_dispose_1(JNIEnv *, + jobject, + jlong); + +#ifdef __cplusplus +} +#endif +#endif // PMPOOL_CLIENT_NATIVE_COM_INTEL_RPMP_PMPOOLCLIENT_H_ diff --git a/rpmp/pmpool/hash/xxhash.cc b/rpmp/pmpool/hash/xxhash.cc new file mode 100644 index 00000000..c99bc012 --- /dev/null +++ b/rpmp/pmpool/hash/xxhash.cc @@ -0,0 +1,1038 @@ +#pragma clang system_header +#pragma gcc system_header +/* +* xxHash - Fast Hash algorithm +* Copyright (C) 2012-2016, Yann Collet +* +* BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: +* +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* * Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following disclaimer +* in the documentation and/or other materials provided with the +* distribution. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +* You can contact the author at : +* - xxHash homepage: http://www.xxhash.com +* - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + + +/* ************************************* +* Tuning parameters +***************************************/ +/*!XXH_FORCE_MEMORY_ACCESS : +* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. +* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. +* The below switch allow to select different access method for improved performance. +* Method 0 (default) : use `memcpy()`. Safe and portable. +* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). +* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. +* Method 2 : direct access. This method doesn't depend on compiler but violate C standard. +* It can generate buggy code on targets which do not support unaligned memory accesses. +* But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) +* See http://stackoverflow.com/a/32095106/646947 for details. +* Prefer these methods in priority order (0 > 1 > 2) +*/ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \ + || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7S__) )) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/*!XXH_ACCEPT_NULL_INPUT_POINTER : +* If input pointer is NULL, xxHash default behavior is to dereference it, triggering a segfault. +* When this macro is enabled, xxHash actively checks input for null pointer. +* It it is, result for null input pointers is the same as a null-length input. +*/ +#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ +# define XXH_ACCEPT_NULL_INPUT_POINTER 0 +#endif + +/*!XXH_FORCE_NATIVE_FORMAT : +* By default, xxHash library provides endian-independent Hash values, based on little-endian convention. +* Results are therefore identical for little-endian and big-endian CPU. +* This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. +* Should endian-independence be of no importance for your application, you may set the #define below to 1, +* to improve speed for Big-endian CPU. +* This option has no impact on Little_Endian CPU. +*/ +#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */ +# define XXH_FORCE_NATIVE_FORMAT 0 +#endif + +/*!XXH_FORCE_ALIGN_CHECK : +* This is a minor performance trick, only useful with lots of very small keys. 
+* It means : check for aligned/unaligned input. +* The check costs one initial branch per hash; +* set it to 0 when the input is guaranteed to be aligned, +* or when alignment doesn't matter for performance. +*/ +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +/*! Modify the local functions below should you wish to use some other memory routines +* for malloc(), free() */ +#include +static void* XXH_malloc(size_t s) { return malloc(s); } +static void XXH_free(void* p) { free(p); } +/*! and for memcpy() */ +#include +static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest, src, size); } + +#include /* assert */ + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash/xxhash.h" + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# define FORCE_INLINE static __forceinline +#else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +#endif + + +/* ************************************* +* Basic Types +***************************************/ +#ifndef MEM_MODULE +# if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +# else +typedef unsigned char BYTE; +typedef unsigned short U16; +typedef unsigned int U32; +# endif +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static U32 XXH_read32(const void* memPtr) { return *(const U32*)memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U32 u32; } __attribute__((packed)) unalign; +static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } + +#else + +/* portable and safe solution. Generally efficient. 
+* see : http://stackoverflow.com/a/32095106/646947 +*/ +static U32 XXH_read32(const void* memPtr) +{ + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */ +#if defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static U32 XXH_swap32(U32 x) +{ + return ((x << 24) & 0xff000000) | + ((x << 8) & 0x00ff0000) | + ((x >> 8) & 0x0000ff00) | + ((x >> 24) & 0x000000ff); +} +#endif + + +/* ************************************* +* Architecture Macros +***************************************/ +typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess; + +/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ +#ifndef XXH_CPU_LITTLE_ENDIAN +static int XXH_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align == XXH_unaligned) + return endian == XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); + else + return endian == XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); +} + +FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE32_align(ptr, endian, XXH_unaligned); +} + +static U32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} + + +/* ************************************* +* Macros +***************************************/ +#define XXH_STATIC_ASSERT(c) { enum { XXH_sa = 1/(int)(!!(c)) }; } /* use after variable declarations */ +XXH_PUBLIC_API unsigned XXH_versionNumber(void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +static const U32 PRIME32_1 = 2654435761U; /* 0b10011110001101110111100110110001 */ +static const U32 PRIME32_2 = 2246822519U; /* 0b10000101111010111100101001110111 */ +static const U32 PRIME32_3 = 3266489917U; /* 0b11000010101100101010111000111101 */ +static const U32 PRIME32_4 = 668265263U; /* 0b00100111110101001110101100101111 */ +static const U32 PRIME32_5 = 374761393U; /* 0b00010110010101100110011110110001 */ + +static U32 XXH32_round(U32 seed, U32 input) +{ + seed += input * PRIME32_2; + seed = XXH_rotl32(seed, 13); + seed *= PRIME32_1; + return seed; +} + +/* mix all bits */ +static U32 XXH32_avalanche(U32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) + +static U32 +XXH32_finalize(U32 h32, const void* ptr, size_t len, + XXH_endianess endian, XXH_alignment align) + +{ + const BYTE* p = (const BYTE*)ptr; + +#define PROCESS1 \ + h32 += (*p++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1 ; + +#define PROCESS4 \ + h32 += XXH_get32bits(p) * PRIME32_3; \ + p+=4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; + + switch (len & 15) /* or switch(bEnd - p) */ + { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + assert(0); + return h32; /* reaching this point is deemed impossible */ +} + + +FORCE_INLINE U32 +XXH32_endian_align(const void* input, size_t len, U32 seed, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (p == NULL) { + len = 0; + bEnd = p = (const BYTE*)(size_t)16; + } +#endif + + if (len >= 16) { + const BYTE* const limit = bEnd - 15; + U32 v1 = seed + PRIME32_1 + PRIME32_2; + U32 v2 = seed + PRIME32_2; + U32 v3 = seed + 0; + U32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(p)); p += 4; + v2 = XXH32_round(v2, XXH_get32bits(p)); p += 4; + v3 = XXH32_round(v3, XXH_get32bits(p)); p += 4; + v4 = XXH32_round(v4, XXH_get32bits(p)); p += 4; + } while (p < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } + else { + h32 = seed + PRIME32_5; + } + + h32 += (U32)len; + + return XXH32_finalize(h32, p, len & 15, endian, align); +} + + 
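+// Usage sketch (illustrative only, not part of the upstream xxHash sources):
+// it shows how the one-shot XXH32() entry point defined just below relates to
+// the streaming state API and the canonical (big-endian) representation. The
+// sample string and seed are arbitrary; the include path mirrors the one used
+// at the top of this file.
+//
+//   #include <string.h>
+//   #define XXH_STATIC_LINKING_ONLY
+//   #include "xxhash/xxhash.h"
+//
+//   int xxh32_usage_example(void)      // hypothetical helper, for illustration
+//   {
+//       const char data[] = "spark-pmof shuffle block";
+//       unsigned int seed = 0;
+//
+//       // One-shot hashing.
+//       unsigned int h1 = XXH32(data, strlen(data), seed);
+//
+//       // Streaming: feeding the same bytes in pieces yields the same hash.
+//       XXH32_state_t* st = XXH32_createState();
+//       XXH32_reset(st, seed);
+//       XXH32_update(st, data, 5);
+//       XXH32_update(st, data + 5, strlen(data) - 5);
+//       unsigned int h2 = XXH32_digest(st);
+//       XXH32_freeState(st);
+//
+//       // Canonical form: endian-independent byte layout, safe to store on disk.
+//       XXH32_canonical_t c;
+//       XXH32_canonicalFromHash(&c, h1);
+//       unsigned int h3 = XXH32_hashFromCanonical(&c);
+//
+//       return (h1 == h2) && (h1 == h3);   // both comparisons hold
+//   }
+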
+XXH_PUBLIC_API unsigned int XXH32(const void* input, size_t len, unsigned int seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, input, len); + return XXH32_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } + } + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + + + +/*====== Hash streaming ======*/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +FORCE_INLINE XXH_errorcode +XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + if (input == NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + + state->total_len_32 += (unsigned)len; + state->large_len |= (len >= 16) | (state->total_len_32 >= 16); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); + state->memsize += (unsigned)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16 - state->memsize); + { const U32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); + } + p += 16 - state->memsize; + state->memsize = 0; + } + + if (p <= bEnd - 16) { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p += 4; + v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p += 4; + v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p += 4; + v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p += 4; + } while (p <= limit); + + 
state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd - p)); + state->memsize = (unsigned)(bEnd - p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH32_update_endian(state_in, input, len, XXH_bigEndian); +} + + +FORCE_INLINE U32 +XXH32_digest_endian(const XXH32_state_t* state, XXH_endianess endian) +{ + U32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } + else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, state->mem32, state->memsize, endian, XXH_aligned); +} + + +XXH_PUBLIC_API unsigned int XXH32_digest(const XXH32_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_digest_endian(state_in, XXH_littleEndian); + else + return XXH32_digest_endian(state_in, XXH_bigEndian); +} + + +/*====== Canonical representation ======*/ + +/*! Default XXH result types are basic unsigned 32 and 64 bits. +* The canonical representation follows human-readable write convention, aka big-endian (large digits first). +* These functions allow transformation of hash result into and from its canonical format. +* This way, hash values can be written into a file or buffer, remaining comparable across different systems. +*/ + +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/*====== Memory access ======*/ + +#ifndef MEM_MODULE +# define MEM_MODULE +# if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include +typedef uint64_t U64; +# else +/* if compiler doesn't support unsigned long long, replace by another 64-bit type */ +typedef unsigned long long U64; +# endif +#endif + + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static U64 XXH_read64(const void* memPtr) { return *(const U64*)memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign64; +static U64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* portable and safe solution. Generally efficient. 
+* see : http://stackoverflow.com/a/32095106/646947 +*/ + +static U64 XXH_read64(const void* memPtr) +{ + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static U64 XXH_swap64(U64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + +FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align == XXH_unaligned) + return endian == XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); + else + return endian == XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr); +} + +FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE64_align(ptr, endian, XXH_unaligned); +} + +static U64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} + + +/*====== xxh64 ======*/ + +static const U64 PRIME64_1 = 11400714785074694791ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const U64 PRIME64_2 = 14029467366897019727ULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const U64 PRIME64_3 = 1609587929392839161ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const U64 PRIME64_4 = 9650029242287828579ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const U64 PRIME64_5 = 2870177450012600261ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +static U64 XXH64_round(U64 acc, U64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static U64 XXH64_mergeRound(U64 acc, U64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static U64 XXH64_avalanche(U64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) + +static U64 +XXH64_finalize(U64 h64, const void* ptr, size_t len, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)ptr; + +#define PROCESS1_64 \ + h64 ^= (*p++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + +#define PROCESS4_64 \ + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; \ + p+=4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + +#define PROCESS8_64 { \ + U64 const k1 = XXH64_round(0, XXH_get64bits(p)); \ + p+=8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} + + switch (len & 31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return 
XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + + /* impossible to reach */ + assert(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +FORCE_INLINE U64 +XXH64_endian_align(const void* input, size_t len, U64 seed, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (p == NULL) { + len = 0; + bEnd = p = (const BYTE*)(size_t)32; + } +#endif + + if (len >= 32) { + const BYTE* const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(p)); p += 8; + v2 = XXH64_round(v2, XXH_get64bits(p)); p += 8; + v3 = XXH64_round(v3, XXH_get64bits(p)); p += 8; + v4 = XXH64_round(v4, XXH_get64bits(p)); p += 8; + } while (p <= limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } + else { + h64 = seed + PRIME64_5; + } + + h64 += (U64)len; + + return XXH64_finalize(h64, p, len, endian, align); +} + + +XXH_PUBLIC_API unsigned long long XXH64(const void* input, size_t len, unsigned long long seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, input, len); + return XXH64_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7) == 0) { /* Input is aligned, let's leverage the speed advantage */ + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } + } + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +/*====== Hash Streaming ======*/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return 
(XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed) +{ + XXH64_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + +FORCE_INLINE XXH_errorcode +XXH64_update_endian(XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + if (input == NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32 - state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64 + 0, endian)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64 + 1, endian)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64 + 2, endian)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64 + 3, endian)); + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p + 32 <= bEnd) { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p += 8; + v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p += 8; + v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p += 8; + v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p += 8; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd - p)); + state->memsize = (unsigned)(bEnd - p); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH64_update_endian(state_in, input, len, XXH_bigEndian); +} + +FORCE_INLINE U64 XXH64_digest_endian(const XXH64_state_t* state, XXH_endianess endian) +{ + U64 h64; + + if (state->total_len >= 32) { + U64 const v1 = state->v1; + U64 const v2 = state->v2; + U64 const v3 = state->v3; + U64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } + else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (U64)state->total_len; + + return XXH64_finalize(h64, 
state->mem64, (size_t)state->total_len, endian, XXH_aligned);
+}
+
+XXH_PUBLIC_API unsigned long long XXH64_digest(const XXH64_state_t* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH64_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+/*====== Canonical representation ======*/
+
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);
+}
+
+#endif
diff --git a/rpmp/pmpool/queue/blockingconcurrentqueue.h b/rpmp/pmpool/queue/blockingconcurrentqueue.h
new file mode 100644
index 00000000..c855f9df
--- /dev/null
+++ b/rpmp/pmpool/queue/blockingconcurrentqueue.h
@@ -0,0 +1,981 @@
+// Provides an efficient blocking version of moodycamel::ConcurrentQueue.
+// ©2015-2016 Cameron Desrochers. Distributed under the terms of the simplified
+// BSD license, available at the top of concurrentqueue.h.
+// Uses Jeff Preshing's semaphore implementation (under the terms of its
+// separate zlib license, embedded below).
+
+#pragma once
+
+#include "concurrentqueue.h"
+#include <type_traits>
+#include <cerrno>
+#include <memory>
+#include <chrono>
+#include <ctime>
+
+#if defined(_WIN32)
+// Avoid including windows.h in a header; we only need a handful of
+// items, so we'll redeclare them here (this is relatively safe since
+// the API generally has to remain stable between Windows versions).
+// I know this is an ugly hack but it still beats polluting the global
+// namespace with thousands of generic names or adding a .cpp for nothing.
+extern "C" {
+    struct _SECURITY_ATTRIBUTES;
+    __declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
+    __declspec(dllimport) int __stdcall CloseHandle(void* hObject);
+    __declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
+    __declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
+}
+#elif defined(__MACH__)
+#include <mach/mach.h>
+#elif defined(__unix__)
+#include <semaphore.h>
+#endif
+
+namespace moodycamel
+{
+namespace details
+{
+    // Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's
+    // portable + lightweight semaphore implementations, originally from
+    // https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
+    // LICENSE:
+    // Copyright (c) 2015 Jeff Preshing
+    //
+    // This software is provided 'as-is', without any express or implied
+    // warranty. In no event will the authors be held liable for any damages
+    // arising from the use of this software.
+    //
+    // Permission is granted to anyone to use this software for any purpose,
+    // including commercial applications, and to alter it and redistribute it
+    // freely, subject to the following restrictions:
+    //
+    // 1. The origin of this software must not be misrepresented; you must not
+    // claim that you wrote the original software. If you use this software
+    // in a product, an acknowledgement in the product documentation would be
+    // appreciated but is not required.
+    // 2.
Altered source versions must be plainly marked as such, and must not be + // misrepresented as being the original software. + // 3. This notice may not be removed or altered from any source distribution. + namespace mpmc_sema + { +#if defined(_WIN32) + class Semaphore + { + private: + void* m_hSema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + + public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + const long maxLong = 0x7fffffff; + m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr); + } + + ~Semaphore() + { + CloseHandle(m_hSema); + } + + void wait() + { + const unsigned long infinite = 0xffffffff; + WaitForSingleObject(m_hSema, infinite); + } + + bool try_wait() + { + const unsigned long RC_WAIT_TIMEOUT = 0x00000102; + return WaitForSingleObject(m_hSema, 0) != RC_WAIT_TIMEOUT; + } + + bool timed_wait(std::uint64_t usecs) + { + const unsigned long RC_WAIT_TIMEOUT = 0x00000102; + return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) != RC_WAIT_TIMEOUT; + } + + void signal(int count = 1) + { + ReleaseSemaphore(m_hSema, count, nullptr); + } + }; +#elif defined(__MACH__) + //--------------------------------------------------------- + // Semaphore (Apple iOS and OSX) + // Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html + //--------------------------------------------------------- + class Semaphore + { + private: + semaphore_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + + public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount); + } + + ~Semaphore() + { + semaphore_destroy(mach_task_self(), m_sema); + } + + void wait() + { + semaphore_wait(m_sema); + } + + bool try_wait() + { + return timed_wait(0); + } + + bool timed_wait(std::uint64_t timeout_usecs) + { + mach_timespec_t ts; + ts.tv_sec = static_cast(timeout_usecs / 1000000); + ts.tv_nsec = (timeout_usecs % 1000000) * 1000; + + // added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html + kern_return_t rc = semaphore_timedwait(m_sema, ts); + + return rc != KERN_OPERATION_TIMED_OUT && rc != KERN_ABORTED; + } + + void signal() + { + semaphore_signal(m_sema); + } + + void signal(int count) + { + while (count-- > 0) + { + semaphore_signal(m_sema); + } + } + }; +#elif defined(__unix__) + //--------------------------------------------------------- + // Semaphore (POSIX, Linux) + //--------------------------------------------------------- + class Semaphore + { + private: + sem_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + + public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + sem_init(&m_sema, 0, initialCount); + } + + ~Semaphore() + { + sem_destroy(&m_sema); + } + + void wait() + { + // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error + int rc; + do { + rc = sem_wait(&m_sema); + } while (rc == -1 && errno == EINTR); + } + + bool try_wait() + { + int rc; + do { + rc = sem_trywait(&m_sema); + } while (rc == -1 && errno == EINTR); + return !(rc == -1 && errno == EAGAIN); + } + + bool 
timed_wait(std::uint64_t usecs) + { + struct timespec ts; + const int usecs_in_1_sec = 1000000; + const int nsecs_in_1_sec = 1000000000; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += usecs / usecs_in_1_sec; + ts.tv_nsec += (usecs % usecs_in_1_sec) * 1000; + // sem_timedwait bombs if you have more than 1e9 in tv_nsec + // so we have to clean things up before passing it in + if (ts.tv_nsec >= nsecs_in_1_sec) { + ts.tv_nsec -= nsecs_in_1_sec; + ++ts.tv_sec; + } + + int rc; + do { + rc = sem_timedwait(&m_sema, &ts); + } while (rc == -1 && errno == EINTR); + return !(rc == -1 && errno == ETIMEDOUT); + } + + void signal() + { + sem_post(&m_sema); + } + + void signal(int count) + { + while (count-- > 0) + { + sem_post(&m_sema); + } + } + }; +#else +#error Unsupported platform! (No semaphore wrapper available) +#endif + + //--------------------------------------------------------- + // LightweightSemaphore + //--------------------------------------------------------- + class LightweightSemaphore + { + public: + typedef std::make_signed::type ssize_t; + + private: + std::atomic m_count; + Semaphore m_sema; + + bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) + { + ssize_t oldCount; + // Is there a better way to set the initial spin count? + // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC, + // as threads start hitting the kernel semaphore. + int spin = 10000; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) + return true; + std::atomic_signal_fence(std::memory_order_acquire); // Prevent the compiler from collapsing the loop. + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount > 0) + return true; + if (timeout_usecs < 0) + { + m_sema.wait(); + return true; + } + if (m_sema.timed_wait((std::uint64_t)timeout_usecs)) + return true; + // At this point, we've timed out waiting for the semaphore, but the + // count is still decremented indicating we may still be waiting on + // it. So we have to re-adjust the count, but only if the semaphore + // wasn't signaled enough times for us too since then. If it was, we + // need to release the semaphore too. + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + return true; + if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) + return false; + } + } + + ssize_t waitManyWithPartialSpinning(ssize_t max, std::int64_t timeout_usecs = -1) + { + assert(max > 0); + ssize_t oldCount; + int spin = 10000; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if (oldCount > 0) + { + ssize_t newCount = oldCount > max ? 
oldCount - max : 0; + if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) + return oldCount - newCount; + } + std::atomic_signal_fence(std::memory_order_acquire); + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount <= 0) + { + if (timeout_usecs < 0) + m_sema.wait(); + else if (!m_sema.timed_wait((std::uint64_t)timeout_usecs)) + { + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + break; + if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) + return 0; + } + } + } + if (max > 1) + return 1 + tryWaitMany(max - 1); + return 1; + } + + public: + LightweightSemaphore(ssize_t initialCount = 0) : m_count(initialCount) + { + assert(initialCount >= 0); + } + + bool tryWait() + { + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) + return true; + } + return false; + } + + void wait() + { + if (!tryWait()) + waitWithPartialSpinning(); + } + + bool wait(std::int64_t timeout_usecs) + { + return tryWait() || waitWithPartialSpinning(timeout_usecs); + } + + // Acquires between 0 and (greedily) max, inclusive + ssize_t tryWaitMany(ssize_t max) + { + assert(max >= 0); + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + ssize_t newCount = oldCount > max ? oldCount - max : 0; + if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) + return oldCount - newCount; + } + return 0; + } + + // Acquires at least one, and (greedily) at most max + ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs) + { + assert(max >= 0); + ssize_t result = tryWaitMany(max); + if (result == 0 && max > 0) + result = waitManyWithPartialSpinning(max, timeout_usecs); + return result; + } + + ssize_t waitMany(ssize_t max) + { + ssize_t result = waitMany(max, -1); + assert(result > 0); + return result; + } + + void signal(ssize_t count = 1) + { + assert(count >= 0); + ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release); + ssize_t toRelease = -oldCount < count ? -oldCount : count; + if (toRelease > 0) + { + m_sema.signal((int)toRelease); + } + } + + ssize_t availableApprox() const + { + ssize_t count = m_count.load(std::memory_order_relaxed); + return count > 0 ? count : 0; + } + }; + } // end namespace mpmc_sema +} // end namespace details + + +// This is a blocking version of the queue. It has an almost identical interface to +// the normal non-blocking version, with the addition of various wait_dequeue() methods +// and the removal of producer-specific dequeue methods. 
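+
+// Usage sketch (illustrative only, not part of the upstream moodycamel sources):
+// a minimal producer/consumer pairing of the class defined below. enqueue()
+// forwards to the lock-free inner queue and then signals the internal
+// semaphore, while wait_dequeue()/wait_dequeue_timed() block the consumer
+// until an element is available or the timeout expires. The element type,
+// element count, timeout value and include path are assumptions made for the
+// example; the path follows this repository's rpmp/pmpool/queue layout.
+//
+//   #include <cstdint>
+//   #include <thread>
+//   #include "pmpool/queue/blockingconcurrentqueue.h"
+//
+//   void blocking_queue_example()    // hypothetical helper, for illustration
+//   {
+//       moodycamel::BlockingConcurrentQueue<std::uint64_t> q;
+//
+//       std::thread producer([&q] {
+//           for (std::uint64_t i = 0; i < 100; i++)
+//               q.enqueue(i);                          // lock-free enqueue + semaphore signal
+//       });
+//
+//       std::thread consumer([&q] {
+//           std::uint64_t item;
+//           int received = 0;
+//           while (received < 100) {
+//               if (q.wait_dequeue_timed(item, 1000))  // block up to 1000 microseconds
+//                   received++;
+//           }
+//       });
+//
+//       producer.join();
+//       consumer.join();
+//   }
+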
+template +class BlockingConcurrentQueue +{ +private: + typedef ::moodycamel::ConcurrentQueue ConcurrentQueue; + typedef details::mpmc_sema::LightweightSemaphore LightweightSemaphore; + +public: + typedef typename ConcurrentQueue::producer_token_t producer_token_t; + typedef typename ConcurrentQueue::consumer_token_t consumer_token_t; + + typedef typename ConcurrentQueue::index_t index_t; + typedef typename ConcurrentQueue::size_t size_t; + typedef typename std::make_signed::type ssize_t; + + static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE; + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD; + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE; + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE; + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE; + static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE; + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit BlockingConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : inner(capacity), sema(create(), &BlockingConcurrentQueue::template destroy) + { + assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); + if (!sema) { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : inner(minCapacity, maxExplicitProducers, maxImplicitProducers), sema(create(), &BlockingConcurrentQueue::template destroy) + { + assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); + if (!sema) { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + // Disable copying and copy assignment + BlockingConcurrentQueue(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
+ BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : inner(std::move(other.inner)), sema(std::move(other.sema)) + { } + + inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + inner.swap(other.inner); + sema.swap(other.sema); + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + if ((details::likely)(inner.enqueue(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + if ((details::likely)(inner.enqueue(std::move(item)))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + if ((details::likely)(inner.enqueue(token, item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + if ((details::likely)(inner.enqueue(token, std::move(item)))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + inline bool enqueue_bulk(It itemFirst, size_t count) + { + if ((details::likely)(inner.enqueue_bulk(std::forward(itemFirst), count))) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. 
Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + if ((details::likely)(inner.enqueue_bulk(token, std::forward(itemFirst), count))) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + if (inner.try_enqueue(item)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + if (inner.try_enqueue(std::move(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + if (inner.try_enqueue(token, item)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + if (inner.try_enqueue(token, std::move(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(std::forward(itemFirst), count)) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(token, std::forward(itemFirst), count)) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
+ template + inline bool try_dequeue(U& item) + { + if (sema->tryWait()) { + while (!inner.try_dequeue(item)) { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue(consumer_token_t& token, U& item) + { + if (sema->tryWait()) { + while (!inner.try_dequeue(token, item)) { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + + + // Blocks the current thread until there's something to dequeue, then + // dequeues it. + // Never allocates. Thread-safe. + template + inline void wait_dequeue(U& item) + { + sema->wait(); + while (!inner.try_dequeue(item)) { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) { + return false; + } + while (!inner.try_dequeue(item)) { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed(item, std::chrono::duration_cast(timeout).count()); + } + + // Blocks the current thread until there's something to dequeue, then + // dequeues it using an explicit consumer token. + // Never allocates. Thread-safe. 
+ template + inline void wait_dequeue(consumer_token_t& token, U& item) + { + sema->wait(); + while (!inner.try_dequeue(token, item)) { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) { + return false; + } + while (!inner.try_dequeue(token, item)) { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed(token, item, std::chrono::duration_cast(timeout).count()); + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::int64_t timeout_usecs) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed(itemFirst, max, std::chrono::duration_cast(timeout).count()); + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. 
+ template + inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::int64_t timeout_usecs) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed(token, itemFirst, max, std::chrono::duration_cast(timeout).count()); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + inline size_t size_approx() const + { + return (size_t)sema->availableApprox(); + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static bool is_lock_free() + { + return ConcurrentQueue::is_lock_free(); + } + + +private: + template + static inline U* create() + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U* create(A1&& a1) + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) { + p->~U(); + } + (Traits::free)(p); + } + +private: + ConcurrentQueue inner; + std::unique_ptr sema; +}; + + +template +inline void swap(BlockingConcurrentQueue& a, BlockingConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} // end namespace moodycamel diff --git a/rpmp/pmpool/queue/concurrentqueue.h b/rpmp/pmpool/queue/concurrentqueue.h new file mode 100644 index 00000000..21cb9375 --- /dev/null +++ b/rpmp/pmpool/queue/concurrentqueue.h @@ -0,0 +1,3636 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. 
+// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2016, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this list of +// conditions and the following disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +#pragma once + +#if defined(__GNUC__) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. +// We'll override the default trait malloc ourselves without a macro. +#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. 
+#include +#endif +#include // for max_align_t +#include +#include +#include +#include +#include +#include +#include // for CHAR_BIT +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading + +// Platform-specific definitions of a numeric thread ID type and an invalid value +namespace moodycamel { namespace details { + template struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } + }; +} } +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel { namespace details { + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; + static inline thread_id_t thread_id() { return rl::thread_index(); } +} } +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the function +// we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +namespace moodycamel { namespace details { + static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. + static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } +} } +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) +namespace moodycamel { namespace details { + static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); + + typedef std::thread::id thread_id_t; + static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + + // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's + // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't + // be. 
+ static inline thread_id_t thread_id() { return std::this_thread::get_id(); } + + template struct thread_id_size { }; + template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; + template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; + + template<> struct thread_id_converter { + typedef thread_id_size::numeric_t thread_id_numeric_size_t; +#ifndef __APPLE__ + typedef std::size_t thread_id_hash_t; +#else + typedef thread_id_numeric_size_t thread_id_hash_t; +#endif + + static thread_id_hash_t prehash(thread_id_t const& x) + { +#ifndef __APPLE__ + return std::hash()(x); +#else + return *reinterpret_cast(&x); +#endif + } + }; +} } +#else +// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 +// In order to get a numeric thread ID in a platform-independent way, we use a thread-local +// static variable's address as a thread identifier :-) +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define MOODYCAMEL_THREADLOCAL __thread +#elif defined(_MSC_VER) +#define MOODYCAMEL_THREADLOCAL __declspec(thread) +#else +// Assume C++11 compliant compiler +#define MOODYCAMEL_THREADLOCAL thread_local +#endif +namespace moodycamel { namespace details { + typedef std::uintptr_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr + static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. + static inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } +} } +#endif + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw (expr) +#else +#define MOODYCAMEL_TRY if (true) +#define MOODYCAMEL_CATCH(...) else if (false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( +// We have to assume *all* non-trivial constructors may throw on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? 
std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#else +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 +// g++ <=4.7 doesn't support thread_local either. +// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) +// Assume `thread_local` is fully supported in all other C++11 compilers/platforms +//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now since several users report having problems with it on +#endif +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +// Compiler-specific likely/unlikely hints +namespace moodycamel { namespace details { +#if defined(__GNUC__) + static inline bool (likely)(bool x) { return __builtin_expect((x), true); } + static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +#else + static inline bool (likely)(bool x) { return x; } + static inline bool (unlikely)(bool x) { return x; } +#endif +} } + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG +#include "internal/concurrentqueue_internal_debug.h" +#endif + +namespace moodycamel { +namespace details { + template + struct const_numeric_max { + static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits::is_signed + ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) + : static_cast(-1); + }; + +#if defined(__GLIBCXX__) + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +#endif + + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. + typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} + +// Default traits for the ConcurrentQueue. 
To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. 
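+ // Illustrative sketch, not part of the upstream header: the intended way to
+ // customize these defaults is to inherit from ConcurrentQueueDefaultTraits,
+ // shadow only the members you want to change, and pass the derived struct as
+ // the queue's second template argument. The struct name and values below are
+ // arbitrary example choices.
+ //
+ //   struct MyTraits : public moodycamel::ConcurrentQueueDefaultTraits
+ //   {
+ //       static const size_t BLOCK_SIZE = 256;   // must stay a power of 2
+ //       typedef std::uint64_t index_t;          // wider index to avoid wrap-around
+ //   };
+ //   moodycamel::ConcurrentQueue<int, MyTraits> queue;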
+ static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; + + +#ifndef MCDBGQ_USE_RELACY + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } +#else + static inline void* malloc(size_t size) { return std::malloc(size); } + static inline void free(void* ptr) { return std::free(ptr); } +#endif +#else + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void* ptr) { return rl::rl_free(ptr, $); } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). +struct ProducerToken; +struct ConsumerToken; + +template class ConcurrentQueue; +template class BlockingConcurrentQueue; +class ConcurrentQueueTests; + + +namespace details +{ + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) + { + } + }; + + template struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } + }; + template<> struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } + }; + template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; + + static inline size_t hash_thread_id(thread_id_t id) + { + static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( + thread_id_converter::prehash(id))); + } + + template + static inline bool circular_less_than(T a, T b) + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4554) +#endif + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER 
+#pragma warning(pop) +#endif + } + + template + static inline char* align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } + + template + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static inline void swap_relaxed(std::atomic& left, std::atomic& right) + { + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); + } + + template + static inline T const& nomove(T const& x) + { + return x; + } + + template + struct nomove_if + { + template + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if + { + template + static inline auto eval(U&& x) + -> decltype(std::forward(x)) + { + return std::forward(x); + } + }; + + template + static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) + { + return *it; + } + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + template struct is_trivially_destructible : std::is_trivially_destructible { }; +#else + template struct is_trivially_destructible : std::has_trivial_destructor { }; +#endif + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY + typedef RelacyThreadExitListener ThreadExitListener; + typedef RelacyThreadExitNotifier ThreadExitNotifier; +#else + struct ThreadExitListener + { + typedef void (*callback_t)(void*); + callback_t callback; + void* userData; + + ThreadExitListener* next; // reserved for use by the ThreadExitNotifier + }; + + + class ThreadExitNotifier + { + public: + static void subscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + listener->next = tlsInst.tail; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + ThreadExitListener** prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier() : tail(nullptr) { } + ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier& instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + private: + ThreadExitListener* tail; + }; +#endif +#endif + + template struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template struct static_is_lock_free : static_is_lock_free_num::type> { }; + template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; + template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} + + +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue& queue); + + template + explicit ProducerToken(BlockingConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. 
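+ // Illustrative sketch, not part of the upstream header: a producer token is
+ // constructed from the queue it will be used with and is normally kept for as
+ // long as that thread produces into the queue. Queue type and variable names
+ // here are examples only.
+ //
+ //   moodycamel::ConcurrentQueue<int> queue;
+ //   moodycamel::ProducerToken ptok(queue);   // may allocate; check valid()
+ //   if (ptok.valid()) {
+ //       queue.enqueue(ptok, 17);             // token-based fast path
+ //   }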
+ inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + template + explicit ConsumerToken(BlockingConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. +// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; + + +template +class ConcurrentQueue +{ +public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) 
+#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). 
+ explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
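+ // Illustrative sketch, not part of the upstream header, showing the two
+ // constructors documented above; the capacity and producer counts are
+ // arbitrary example values.
+ //
+ //   // At least 1024 element slots (blocks are not shared between producers):
+ //   moodycamel::ConcurrentQueue<int> q1(1024);
+ //
+ //   // Pre-allocate for at least 1024 elements with at most 2 explicit
+ //   // (token-based) and 8 implicit (token-less) producer threads:
+ //   moodycamel::ConcurrentQueue<int> q2(1024, 2, 8);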
+ ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + ConcurrentQueue& swap_internal(ConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); +#endif + + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. 
+ inline bool enqueue(T const& item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. 
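+ // Illustrative sketch, not part of the upstream header, of the enqueue
+ // overloads declared above; element type and values are examples only.
+ //
+ //   moodycamel::ConcurrentQueue<int> q;
+ //   q.enqueue(1);                      // copies; allocates a block if needed
+ //   int value = 2;
+ //   q.enqueue(std::move(value));       // moves when the element type supports it
+ //   if (!q.try_enqueue(3)) {           // does not allocate; fails when no room
+ //       // handle back-pressure here
+ //   }
+ //   int batch[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ //   q.enqueue_bulk(batch, 8);          // one bulk call amortizes the overhead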
+ inline bool try_enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U& item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. + size_t nonEmptyCount = 0; + ProducerBase* best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall throughput + // under contention, but will give more predictable results in single-threaded + // consumer scenarios. This is mostly only useful for internal unit tests. + // Never allocates. Thread-safe. + template + bool try_dequeue_non_interleaved(U& item) + { + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->dequeue(item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. 
+ // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(consumer_token_t& token, U& item) + { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less + // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place + // If there's no items where you're supposed to be, keep moving until you find a producer with some items + // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it + + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (static_cast(token.currentProducer)->dequeue(item)) { + if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + if (ptr->dequeue(item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
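+ // Illustrative sketch, not part of the upstream header: dequeuing with and
+ // without a consumer token. Variable names are examples only.
+ //
+ //   moodycamel::ConcurrentQueue<int> q;
+ //   int item;
+ //   if (q.try_dequeue(item)) { /* consumed one element */ }
+ //
+ //   moodycamel::ConsumerToken ctok(q);   // amortizes producer selection/rotation
+ //   int buf[32];
+ //   size_t n = q.try_dequeue_bulk(ctok, buf, 32);
+ //   // the first n slots of buf were filled; n == 0 means it appeared empty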
+ template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
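+ // Illustrative sketch, not part of the upstream header: dequeuing straight from
+ // a known producer's sub-queue and querying the approximate size. Names are
+ // examples only.
+ //
+ //   moodycamel::ConcurrentQueue<int> q;
+ //   moodycamel::ProducerToken ptok(q);
+ //   q.enqueue(ptok, 5);
+ //   int item;
+ //   bool got = q.try_dequeue_from_producer(ptok, item);  // producer-specific path
+ //   size_t approx = q.size_approx();  // exact only once the queue has stabilized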
+ static bool is_lock_free() + { + return + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } + + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const& token, U&& element) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue(U&& element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. 
Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. 
the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true) + { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template + inline bool set_empty(index_t i) + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
+ template + inline bool set_many_empty(index_t i, size_t count) + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + + private: + // IMPORTANT: This must be the first member in Block, so that if T depends on the alignment of + // addresses returned by malloc, that alignment will be preserved. Apparently clang actually + // generates code that uses this assumption for AVX instructions in some cases. Ideally, we + // should also align Block to the alignment of T in case it's higher than malloc's 16-byte + // alignment, but this is hard to do in a cross-platform way. Assert for this case: + static_assert(std::alignment_of::value <= std::alignment_of::value, "The queue does not support super-aligned types at this time"); + // Additionally, we need the alignment of Block itself to be a multiple of max_align_t since + // otherwise the appropriate padding will not be added at the end of Block in order to make + // arrays of Blocks all be properly aligned (not just the first one). We use a union to force + // this. + union { + char elements[sizeof(T) * BLOCK_SIZE]; + details::max_align_t dummy; + }; + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void* owner; +#endif + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + + +#ifdef MCDBGQ_TRACKMEM +public: + struct MemStats; +private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) + { + } + + virtual ~ProducerBase() { }; + + template + inline bool dequeue(U& element) + { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } + else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + else { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* parent) : + ProducerBase(parent, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(parent->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
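+ // (That is, every element in the index range [headIndex, tailIndex) is still fully constructed and no
+ // concurrent dequeue can be observing it any more, so destroying them all here is safe.)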
+ if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) { + destroy(block); + } + else { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. 
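+ // (This is the fast path: no allocation and no free-list traffic, just the reset_empty() above and a
+ // refreshed block index entry further down.)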
+ } + else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + + if (allocMode == CannotAlloc || !new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + // The constructor may throw. We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common case when the queue is + // empty and the values are eventually consistent -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, they are not going to change (unless we + // change them) and must be the same value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. + // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in + // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). + // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all + // read-modify-write operations are guaranteed to work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only the C++11 standard. + // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever + // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now + // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. + // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) + // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. + + // Note that we reload tail here in case it changed; it will be the same value as before or greater, since + // this load is sequenced after (happens after) the earlier load above. 
This is supported by read-read + // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least one element, this + // will never exceed tail. We need to do an acquire-release fence here since it's possible + // that whatever condition got us to this point was for an earlier enqueued element (that + // we already see the memory effects for), but that by the time we increment somebody else + // has incremented it, and we need to see the memory effects for *that* element, which is + // in such a case is necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a tail that is at least + // as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because of index wrap-around. + // When an index wraps, we need to preserve the sign of the offset when dividing it by the + // block size (in order to get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / BLOCK_SIZE); + auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; + + // Dequeue + auto& el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even if the assignment + // throws + struct Guard { + Block* block; + index_t index; + + ~Guard() + { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty(index); + } + } guard = { block, index }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty(index); + } + + return true; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write + } + } + + return false; + } + + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
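+ // Illustrative arithmetic (assuming the default BLOCK_SIZE of 32): with startTailIndex == 30 and count == 5,
+ // blockBaseDiff below works out to ((30 + 5 - 1) & ~31) - ((30 - 1) & ~31) = 32 - 0 = 32, i.e. exactly one
+ // extra block has to be found or allocated before any element is constructed.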
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + if (allocMode == CannotAlloc || full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + auto stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst))) && firstAllocatedBlock != nullptr) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);; + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + auto endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) { + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer 
only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer* nextExplicitProducer; + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue* parent) : + ProducerBase(parent, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { + // Note that since we're in the destructor we can assume that all enqueue/dequeue operations + // completed already; this means that all undequeued elements are placed contiguously across + // contiguous blocks, and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
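+ // (forceFreeLastBlock covers the case where the loop above ran: the loop never frees the block it is
+ // currently walking, so the final block must still be handed back here even when tail is block-aligned.)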
+ if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) 
{ + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = { block, index, entry, this->parent }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty(index)) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + } + + return true; + } + else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. 
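+ // (Concretely: when tailIndex is exactly block-aligned here, the code below never dereferences the old
+ // this->tailBlock; it only overwrites the pointer once freshly requisitioned blocks are in place, so a
+ // tail block that has meanwhile migrated to the free list is never touched.)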
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + auto stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? 
desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + auto endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) { + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires the block + // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
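+ // (The nullptr store below is relaxed on purpose: correctness does not depend on when other threads
+ // observe it, since the block itself is only ever reused after being re-acquired through the free list.)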
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the constructor + } + auto newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + if (allocMode == CannotAlloc || !new_block_index()) { + return false; + } + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast(static_cast::type>(index - tailBase) / BLOCK_SIZE); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + 
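+ // (e.g. with 32-bit indices, index 0xFFFFFFE0 and tailBase 0 after the tail has wrapped: the raw difference
+ // is 0xFFFFFFE0, which reinterpreted as signed is -32, giving the intended offset of -1 block under the
+ // default BLOCK_SIZE of 32, whereas an unsigned shift would yield a huge positive offset.)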
assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer* nextImplicitProducer; + private: +#endif + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + if (canAlloc == CanAlloc) { + return create(); + } + + return nullptr; + } + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = { 0 }; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 
0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { + //auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast(index->prev); + } + } + } + + auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() + { + return MemStats::getFor(this); + } + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { + bool recycled; + return recycle_or_create_producer(isExplicit, recycled); + } + + ProducerBase* recycle_or_create_producer(bool isExplicit, bool& recycled) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + recycled = true; + return ptr; + } + } + } + + recycled = false; + return add_producer(isExplicit ? 
static_cast(create(this)) : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } + else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! + for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) { } + + ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP* entries; + ImplicitProducerHash* prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return; + + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + + void swap_implicit_producer_hashes(ConcurrentQueue& other) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return; + + // Swap (assumes our implicit producer hash is initialized) + 
initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer* get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. + // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1; + + auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + auto newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = newCapacity; + newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } + else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + bool recycled; + auto producer = static_cast(recycle_or_create_producer(false, recycled)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + if (recycled) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + 
producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + auto probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited(ImplicitProducer* producer) + { + // Remove from thread exit listeners + details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener); + + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on the current one yet and are + // trying to add an entry thinking there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1; + probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + hash->entries[index].key.store(details::invalid_thread_id2, std::memory_order_release); + break; + } + ++index; + } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void* userData) + { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline U* create_array(size_t count) + { + assert(count > 0); + auto p = static_cast((Traits::malloc)(sizeof(U) * count)); + if (p == nullptr) { + return nullptr; + } + + for (size_t i = 0; i != count; ++i) { + new (p + i) U(); + } + return p; + } + + template + static inline void destroy_array(U* p, size_t count) + { + if (p != nullptr) { + assert(count > 0); + 
for (size_t i = count; i != 0; ) {
+				(p + --i)->~U();
+			}
+			(Traits::free)(p);
+		}
+	}
+
+	template<typename U>
+	static inline U* create()
+	{
+		auto p = (Traits::malloc)(sizeof(U));
+		return p != nullptr ? new (p) U : nullptr;
+	}
+
+	template<typename U, typename A1>
+	static inline U* create(A1&& a1)
+	{
+		auto p = (Traits::malloc)(sizeof(U));
+		return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
+	}
+
+	template<typename U>
+	static inline void destroy(U* p)
+	{
+		if (p != nullptr) {
+			p->~U();
+		}
+		(Traits::free)(p);
+	}
+
+private:
+	std::atomic<ProducerBase*> producerListTail;
+	std::atomic<std::uint32_t> producerCount;
+
+	std::atomic<size_t> initialBlockPoolIndex;
+	Block* initialBlockPool;
+	size_t initialBlockPoolSize;
+
+#if !MCDBGQ_USEDEBUGFREELIST
+	FreeList<Block> freeList;
+#else
+	debug::DebugFreeList<Block> freeList;
+#endif
+
+	std::atomic<ImplicitProducerHash*> implicitProducerHash;
+	std::atomic<size_t> implicitProducerHashCount;  // Number of slots logically used
+	ImplicitProducerHash initialImplicitProducerHash;
+	std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
+	std::atomic_flag implicitProducerHashResizeInProgress;
+
+	std::atomic<std::uint32_t> nextExplicitConsumerId;
+	std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+	debug::DebugMutex implicitProdMutex;
+#endif
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	std::atomic<ExplicitProducer*> explicitProducers;
+	std::atomic<ImplicitProducer*> implicitProducers;
+#endif
+};
+
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
+	: producer(queue.recycle_or_create_producer(true))
+{
+	if (producer != nullptr) {
+		producer->token = this;
+	}
+}
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
+	: producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true))
+{
+	if (producer != nullptr) {
+		producer->token = this;
+	}
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+	lastKnownGlobalOffset = -1;
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+	lastKnownGlobalOffset = -1;
+}
+
+template<typename T, typename Traits>
+inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+}
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
diff --git a/rpmp/test/CMakeLists.txt b/rpmp/test/CMakeLists.txt
new file mode 100644
index 00000000..17a8c9e5
--- /dev/null
+++ b/rpmp/test/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_executable(unit_tests unit_test/main.cc unit_test/DigestTest.cc unit_test/CircularBufferTest.cc)
+target_link_libraries(unit_tests gtest_main pmpool)
+
+add_test(NAME unit_tests COMMAND unit_tests)
+
+add_executable(RemoteRead integration_test/RemoteRead.cc)
+target_link_libraries(RemoteRead pmpool)
diff --git a/rpmp/test/integration_test/RemoteRead.cc b/rpmp/test/integration_test/RemoteRead.cc
new file mode 100644
index 00000000..b7ac7b0c
--- /dev/null
+++ b/rpmp/test/integration_test/RemoteRead.cc
@@ -0,0 +1,67 @@
+/*
+ * Filename: /mnt/spark-pmof/tool/rpmp/test/integration_test/RemoteRead.cc
+ * Path: /mnt/spark-pmof/tool/rpmp/test/integration_test
+ * Created Date: Friday, December 20th 2019, 8:29:23 am
+ * Author: root
+ *
+ * Copyright (c) 2019 Intel
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <chrono>  // NOLINT
+#include <iostream>
+#include <mutex>   // NOLINT
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "pmpool/client/PmPoolClient.h"
+
+std::vector<char*> strs;
+char str_read[4096];
+int count = 0;
+std::mutex mtx;
+uint64_t address[2];
+
+uint64_t timestamp_now() {
+  return std::chrono::high_resolution_clock::now().time_since_epoch() /
+         std::chrono::milliseconds(1);
+}
+
+void func(PmPoolClient* client) {
+  while (true) {
+    std::unique_lock<std::mutex> lk(mtx);
+    uint64_t count_ = count++;
+    lk.unlock();
+    if (count_ < 2) {
+      client->read(address[count_], str_read, strlen(strs[count_]));
+      assert(strncmp(str_read, strs[count_], strlen(strs[count_])) == 0);
+    } else {
+      break;
+    }
+  }
+}
+
+int main() {
+  char str[] = "hello world";
+  char str1[] = "hello rpmp";
+  strs.push_back(str);
+  strs.push_back(str1);
+  std::vector<std::thread*> threads;
+  int num = 0;
+  PmPoolClient client("172.168.0.40", "12346");
+  client.init();
+  address[0] = client.write(strs[0], strlen(strs[0]));
+  address[1] = client.write(strs[1], strlen(strs[1]));
+  for (int i = 0; i < 1; i++) {
+    num++;
+    auto t = new std::thread(func, &client);
+    threads.push_back(t);
+  }
+  for (int i = 0; i < num; i++) {
+    threads[i]->join();
+    delete threads[i];
+  }
+  client.free(address[0]);
+  client.free(address[1]);
+  std::cout << "finished." << std::endl;
+  client.shutdown();
+  client.wait();
+  return 0;
+}
diff --git a/rpmp/test/unit_test/CircularBufferTest.cc b/rpmp/test/unit_test/CircularBufferTest.cc
new file mode 100644
index 00000000..e87a4bec
--- /dev/null
+++ b/rpmp/test/unit_test/CircularBufferTest.cc
@@ -0,0 +1,114 @@
+/*
+ * Filename: /mnt/spark-pmof/tool/rpmp/test/CircularBufferTest.cc
+ * Path: /mnt/spark-pmof/tool/rpmp/test
+ * Created Date: Tuesday, December 24th 2019, 8:56:37 am
+ * Author: root
+ *
+ * Copyright (c) 2019 Intel
+ */
+
+#include <chrono>  // NOLINT
+#include <thread>  // NOLINT
+#include <iostream>
+
+#include "../pmpool/buffer/CircularBuffer.h"
+#include "gtest/gtest.h"
+
+#define private public
+
+TEST(circularbuffer, 1B) {
+  CircularBuffer buffer(1, 10);
+  uint64_t addr = 0;
+  buffer.get(2, &addr);
+  ASSERT_EQ(addr, 0);
+  ASSERT_EQ(buffer.get_write_(), 2);
+  buffer.get(2, &addr);
+  ASSERT_EQ(addr, 2);
+  ASSERT_EQ(buffer.get_write_(), 4);
+  buffer.get(2, &addr);
+  ASSERT_EQ(addr, 4);
+  ASSERT_EQ(buffer.get_write_(), 6);
+  buffer.get(2, &addr);
+  ASSERT_EQ(addr, 6);
+  ASSERT_EQ(buffer.get_write_(), 8);
+  buffer.get(2, &addr);
+  ASSERT_EQ(addr, 8);
+  ASSERT_EQ(buffer.get_write_(), 0);
+  buffer.put(addr, 2);
+  ASSERT_EQ(buffer.get_read_(), 0);
+  addr = 0;
+  buffer.put(addr, 4);
+  ASSERT_EQ(buffer.get_read_(), 4);
+  buffer.get(3, &addr);
+  ASSERT_EQ(addr, 0);
+  ASSERT_EQ(buffer.get_write_(), 3);
+  buffer.put(4, 4);
+  ASSERT_EQ(buffer.get_read_(), 0);
+  buffer.get(4, &addr);
+  ASSERT_EQ(addr, 3);
+  ASSERT_EQ(buffer.get_write_(), 7);
+  buffer.get(3, &addr);
+  ASSERT_EQ(addr, 7);
+  ASSERT_EQ(buffer.get_write_(), 0);
+  buffer.put(5, 4);
+  ASSERT_EQ(buffer.get_read_(), 0);
+  buffer.put(1, 4);
+  ASSERT_EQ(buffer.get_read_(), 0);
+  addr = 0;
+  buffer.put(addr, 1);
+  ASSERT_EQ(buffer.get_read_(), 9);
+  buffer.put(9, 1);
+  ASSERT_EQ(buffer.get_read_(), 0);
+}
+
+TEST(circularbuffer, 4K) {
+  CircularBuffer buffer(4096, 4);
+  uint64_t addr = 0;
+  buffer.get(10, &addr);
+  ASSERT_EQ(addr, 0);
+  ASSERT_EQ(buffer.get_write_(), 1);
+  buffer.get(10, &addr);
+  ASSERT_EQ(addr, 1);
+  ASSERT_EQ(buffer.get_write_(), 2);
+  buffer.get(4097, &addr);
+  ASSERT_EQ(addr, 2);
+  ASSERT_EQ(buffer.get_write_(), 0);
+  buffer.put(2, 2);
+  ASSERT_EQ(buffer.get_read_(), 0);
+  buffer.put(3, 2);
+  ASSERT_EQ(buffer.get_read_(), 0);
+  addr = 0;
+  buffer.put(addr, 2);
+  ASSERT_EQ(buffer.get_read_(), 1);
+  buffer.put(1, 2);
+  ASSERT_EQ(buffer.get_read_(), 0);
+}
+
+void func(CircularBuffer* buffer) {
+  std::cout << "sleep..." << std::endl;
+  std::this_thread::sleep_for(std::chrono::seconds(1));
+  uint64_t addr = 0;
+  buffer->put(addr, 2);
+  std::cout << "put buffer [0, 1]..." << std::endl;
+  buffer->dump();
+  std::this_thread::sleep_for(std::chrono::seconds(1));
+  buffer->put(4, 2);
+  std::cout << "put buffer [4, 5]..." << std::endl;
+  buffer->dump();
+  std::this_thread::sleep_for(std::chrono::seconds(1));
+  buffer->put(2, 2);
+  std::cout << "put buffer [2, 3]..." << std::endl;
+  buffer->dump();
+}
+
+TEST(circularbuffer, multithread) {
+  CircularBuffer buffer(1, 8);
+  uint64_t addr = 0;
+  buffer.get(6, &addr);
+  ASSERT_EQ(addr, 0);
+  std::thread t(func, &buffer);
+  buffer.get(8, &addr);
+  std::cout << "get buffer..." << std::endl;
+  ASSERT_EQ(addr, 0);
+  t.join();
+}
diff --git a/rpmp/test/unit_test/DigestTest.cc b/rpmp/test/unit_test/DigestTest.cc
new file mode 100644
index 00000000..aa3d4ba0
--- /dev/null
+++ b/rpmp/test/unit_test/DigestTest.cc
@@ -0,0 +1,24 @@
+/*
+ * Filename: /mnt/spark-pmof/tool/rpmp/test/DigestTest.cc
+ * Path: /mnt/spark-pmof/tool/rpmp/test
+ * Created Date: Thursday, November 7th 2019, 3:48:52 pm
+ * Author: root
+ *
+ * Copyright (c) 2019 Intel
+ */
+
+#include <stdint.h>
+
+#include <iostream>
+#include <string>
+
+#include "pmpool/Digest.h"
+#include "gtest/gtest.h"
+
+TEST(digest, compute) {
+  std::string str = "hello world";
+  uint64_t hash_value_1;
+  uint64_t hash_value_2;
+  Digest::computeKeyHash(str, &hash_value_1);
+  Digest::computeKeyHash(str, &hash_value_2);
+  ASSERT_TRUE(hash_value_1 == hash_value_2);
+}
diff --git a/rpmp/test/unit_test/main.cc b/rpmp/test/unit_test/main.cc
new file mode 100644
index 00000000..796154a1
--- /dev/null
+++ b/rpmp/test/unit_test/main.cc
@@ -0,0 +1,15 @@
+/*
+ * Filename: /mnt/spark-pmof/tool/rpmp/test/main.cc
+ * Path: /mnt/spark-pmof/tool/rpmp/test
+ * Created Date: Thursday, November 7th 2019, 3:48:52 pm
+ * Author: root
+ *
+ * Copyright (c) 2019 Intel
+ */
+
+#include "gtest/gtest.h"
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
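
Note: the integration test above exercises the full client round trip (write two payloads to the RPMP server, read them back from a worker thread, verify the contents, then free the remote allocations). The single-threaded sketch below distills that flow for reference. It is only an illustration assembled from the PmPoolClient calls that appear in RemoteRead.cc (init, write, read, free, shutdown, wait); the server address 172.168.0.40:12346 is simply reused from that test and is not a documented default.

// remote_roundtrip_sketch.cc (illustrative only; mirrors RemoteRead.cc above)
#include <assert.h>
#include <stdint.h>
#include <string.h>

#include <iostream>

#include "pmpool/client/PmPoolClient.h"

int main() {
  // Address and port are copied from the integration test above.
  PmPoolClient client("172.168.0.40", "12346");
  client.init();

  char msg[] = "hello rpmp";
  // write() stores the payload on the remote pool and returns its remote address.
  uint64_t addr = client.write(msg, strlen(msg));

  // read() copies the requested number of bytes from that address into a local buffer.
  char buf[64] = {0};
  client.read(addr, buf, strlen(msg));
  assert(strncmp(buf, msg, strlen(msg)) == 0);

  client.free(addr);  // release the remote allocation
  client.shutdown();
  client.wait();
  std::cout << "round trip ok" << std::endl;
  return 0;
}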