Skip to content

Commit

Permalink
Experimental, change later
Browse files Browse the repository at this point in the history
  • Loading branch information
maj113 committed May 24, 2024
1 parent 2d45089 commit 4c857bb
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 80 deletions.
11 changes: 9 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,18 @@ cmake_minimum_required(VERSION 3.5)

project(Counter)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 23)
include_directories(${CMAKE_SOURCE_DIR})

if(CMAKE_BUILD_TYPE MATCHES "Debug")
add_definitions(-DAVXDEBUG)
endif()

if(MSVC)
add_compile_options(/GA /EHsc /fp:fast /MP /arch:AVX2)

if(CMAKE_BUILD_TYPE MATCHES "Debug")
add_compile_options(/Od /Zi /Zf)
add_compile_options(/Od /Zi /DEBUG /Zf /MTd /Wall /analyze)
endif()

if(CMAKE_BUILD_TYPE MATCHES "Release")
Expand All @@ -21,6 +25,9 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")

if(CMAKE_BUILD_TYPE MATCHES "Debug")
add_compile_options(-Og -g3)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
add_compile_options(-fanalyzer)
endif()
endif()

if(CMAKE_BUILD_TYPE MATCHES "Release")
Expand Down
109 changes: 72 additions & 37 deletions benchmark/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@
#include <chrono>
#include <string>
#include <algorithm>
#include <cstdint>
#include <cstring>
#include "defs.hpp"
#include "../defs.hpp"

using namespace std;
using namespace chrono;

void printTime(const nanoseconds &duration) {
double time = duration.count();
double time = static_cast<double>(duration.count());
std::string_view unit;
if (time >= 1e9) {
time *= 1e-9;
Expand All @@ -28,17 +27,46 @@ void printTime(const nanoseconds &duration) {
cout << fixed << setprecision(2) << time << unit;
}

uint64_t bench(const char *data, uint64_t dataSize, uint64_t numIterations, bool useOptimized, bool singleThreaded) {
uint64_t bench(const char *data, uint64_t dataSize, uint8_t numIterations, int method, bool stressTest = false) noexcept {

if (stressTest) {
uint64_t result = 0;
for (uint8_t i = 0; i < numIterations * 100; ++i)
result += counter(data, data + dataSize, 10, false);
cout << result << endl;
return result;
}

auto printMethod = [](int method) -> void {
switch (method) {
case COUNT_STANDARD: cout << "Single Threaded Std:\n"; break;
case COUNT_OPTIMIZED: cout << "Single Threaded Opt:\n"; break;
case COUNT_OPTIMIZED_PARALLEL: cout << "Multi Threaded Opt:\n"; break;
default: cerr << "Invalid method specified!\n"; return;
}
};

auto setCount = [&data, &dataSize](int method) -> uint64_t {
switch (method) {
case COUNT_STANDARD:
return std::count(data, data + dataSize, 10);
case COUNT_OPTIMIZED:
return counter(data, data + dataSize, 10, true);
case COUNT_OPTIMIZED_PARALLEL:
return counter(data, data + dataSize, 10, false);
default:
cerr << "Invalid method specified!\n";
return 0;
}
};

printMethod(method);

auto cumulativeTime = nanoseconds(0);
string methodName = useOptimized ? "Opt" : "Std";
string threadingMode = singleThreaded ? "Single Threaded" : "Multi Threaded";
cout << "Benchmarking " << threadingMode << " " << methodName << ":\n";

for (uint64_t i = 0; i < numIterations; ++i) {
auto strt = high_resolution_clock::now();
uint64_t count = useOptimized
? opt_count_parallel(
data, data + dataSize, 10, singleThreaded)
: std::count(data, data + dataSize, 10);
uint64_t count = setCount(method);
auto stop = high_resolution_clock::now();
auto duration = stop - strt;
cumulativeTime += duration;
Expand All @@ -61,42 +89,49 @@ uint64_t bench(const char *data, uint64_t dataSize, uint64_t numIterations, bool
return cumulativeTime.count();
}

int main() {
int main(int argc, char* argv[]) {
constexpr uint64_t dataSize = 10000000000;
constexpr size_t alignment = 32;
constexpr uint8_t numIterations = 10;

char *data_unaligned = new char[dataSize + alignment];
char* data_unaligned = new char[dataSize + alignment];

auto raw_address = reinterpret_cast<uintptr_t>(data_unaligned);
size_t adjustment = alignment - (raw_address % alignment);

char *data = data_unaligned + adjustment;
char* data = data_unaligned + adjustment;

memset(data, '\n', dataSize);

for (uint64_t i = 0; i < dataSize; i += 2)
data[i] = 'x';

constexpr uint64_t numIterations = 10;

uint64_t opt_single_cumulative = bench(data, dataSize, numIterations, true, true);
uint64_t opt_multi_cumulative = bench(data, dataSize, numIterations, true, false);
uint64_t std_single_cumulative = bench(data, dataSize, numIterations, false, true);

double improvement_single = ((double) std_single_cumulative - (double) opt_single_cumulative) / (double)
std_single_cumulative * 100.0;
double improvement_multi = ((double) std_single_cumulative - (double) opt_multi_cumulative) / (double)
std_single_cumulative * 100.0;

cout << "Improvement over std::count: single threaded: " << fixed << setprecision(2)
<< (improvement_single >= 0 ? "" : "-") << abs(improvement_single)
<< "% multi threaded: " << fixed << setprecision(2)
<< (improvement_multi >= 0 ? "" : "-") << abs(improvement_multi) << "%" << endl;

cout << "Times faster over std::count: single threaded: " << fixed << setprecision(2)
<< ((double)std_single_cumulative / (double)opt_single_cumulative >= 0 ? "" : "-")
<< abs((double)std_single_cumulative / (double)opt_single_cumulative)
<< "x multi threaded: " << fixed << setprecision(2)
<< ((double)std_single_cumulative / (double)opt_multi_cumulative >= 0 ? "" : "-")
<< abs((double)std_single_cumulative / (double)opt_multi_cumulative) << "x" << endl;
if (argc > 1 && strcmp(argv[1], "--stress") == 0) {
for (uint8_t i = 0; i < numIterations; ++i)
bench(data, dataSize, numIterations, COUNT_OPTIMIZED_PARALLEL, true);
cout << "Stress test done!" << endl;

} else {
uint64_t opt_single_cumulative = bench(data, dataSize, numIterations, COUNT_OPTIMIZED);
uint64_t opt_multi_cumulative = bench(data, dataSize, numIterations, COUNT_OPTIMIZED_PARALLEL);
uint64_t std_single_cumulative = bench(data, dataSize, numIterations, COUNT_STANDARD);

double improvement_single = ((double)std_single_cumulative - (double)opt_single_cumulative) / (double)
std_single_cumulative * 100.0;
double improvement_multi = ((double)std_single_cumulative - (double)opt_multi_cumulative) / (double)
std_single_cumulative * 100.0;

cout << "Improvement over std::count: single threaded: " << fixed << setprecision(2)
<< (improvement_single >= 0 ? "" : "-") << abs(improvement_single)
<< "% multi threaded: " << fixed << setprecision(2)
<< (improvement_multi >= 0 ? "" : "-") << abs(improvement_multi) << "%" << endl;

cout << "Times faster over std::count: single threaded: " << fixed << setprecision(2)
<< ((double)std_single_cumulative / (double)opt_single_cumulative >= 0 ? "" : "-")
<< abs((double)std_single_cumulative / (double)opt_single_cumulative)
<< "x multi threaded: " << fixed << setprecision(2)
<< ((double)std_single_cumulative / (double)opt_multi_cumulative >= 0 ? "" : "-")
<< abs((double)std_single_cumulative / (double)opt_multi_cumulative) << "x" << endl;
}

delete[] data_unaligned;
return 0;
Expand Down
82 changes: 58 additions & 24 deletions counter.cpp
Original file line number Diff line number Diff line change
@@ -1,62 +1,96 @@
#include "defs.hpp"
#include <immintrin.h>
#include <cstdint>
#include <vector>
#include <future>
#include <array>


#ifdef AVXDEBUG
/**
 * @brief Debug helper: prints the 32 bytes of an AVX2 register as hex on stdout.
 *
 * The bytes are printed in memory order, in groups of four separated by '|'.
 *
 * @param in Register whose 8-bit lanes are dumped.
 */
inline void p256_hex_8(__m256i in) {
    // _mm256_store_si256 requires a 32-byte aligned destination; the previous
    // alignas(16) only guaranteed half of that.
    alignas(32) uint8_t v[32];
    _mm256_store_si256(reinterpret_cast<__m256i *>(v), in);
    printf("v32_i8: %x %x %x %x | %x %x %x %x | %x %x %x %x | %x %x %x %x | "
           "%x %x %x %x | %x %x %x %x | %x %x %x %x | %x %x %x %x\n",
           v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
           v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15],
           v[16], v[17], v[18], v[19], v[20], v[21], v[22], v[23],
           v[24], v[25], v[26], v[27], v[28], v[29], v[30], v[31]);
}

/**
 * @brief Debug helper: prints the four 64-bit lanes of an AVX2 register on stdout.
 *
 * Each lane is printed twice: first as a signed decimal, then as hex.
 *
 * @param in Register whose 64-bit lanes are dumped.
 */
inline void p256_hex_64(__m256i in) {
    // _mm256_store_si256 requires a 32-byte aligned destination; the previous
    // alignas(16) only guaranteed half of that.
    alignas(32) int64_t v[4];
    _mm256_store_si256(reinterpret_cast<__m256i *>(v), in);
    // int64_t is not `long long` on every platform, so convert explicitly to
    // the types that %lli and %llx expect.
    printf("v4_i64: %lli %lli %lli %lli | %llx %llx %llx %llx\n",
           static_cast<long long>(v[0]), static_cast<long long>(v[1]),
           static_cast<long long>(v[2]), static_cast<long long>(v[3]),
           static_cast<unsigned long long>(v[0]), static_cast<unsigned long long>(v[1]),
           static_cast<unsigned long long>(v[2]), static_cast<unsigned long long>(v[3]));
}
#endif

// @powturbo's code with slight modifications
/// Adds one to every 8-bit lane of @p acc whose byte in the 32 bytes at @p p equals
/// the corresponding byte of @p needle. The load is unaligned, so @p p may be any address.
static inline __m256i opt_count_step(__m256i acc, __m256i needle, const char *p) noexcept {
    // cmpeq yields 0xFF (-1) in matching lanes, so subtracting it increments them.
    return _mm256_sub_epi8(
        acc, _mm256_cmpeq_epi8(needle, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(p))));
}

/**
 * @brief Counts how many bytes in the half-open range [s, e) equal @p c, using AVX2.
 *
 * Matches are gathered in 8-bit lanes and widened into 64-bit sums before any
 * lane can exceed 255. Whatever remains after the last full 32-byte vector is
 * counted one byte at a time.
 *
 * @param s Pointer to the first byte to examine; no particular alignment is required.
 * @param e Pointer one past the last byte to examine.
 * @param c Byte value to look for.
 * @return Number of bytes equal to @p c; 0 when the range is empty.
 */
inline int64_t opt_count(const char *s, const char *e, const char c) noexcept {
    const __m256i cv = _mm256_set1_epi8(c), zv = _mm256_setzero_si256();
    __m256i sum = zv, acr0;

    // Each 8-bit lane gains at most 1 per 32-byte load, so 255 loads
    // (255 * 32 bytes) is the most that may be accumulated before widening.
    constexpr int16_t acrlimit = 255 * 32;
    const char *pe;

    // Compare the remaining length instead of forming `s + acrlimit`, which
    // would point far beyond `e` on short inputs.
    while (e - s > acrlimit) {
        // 51 passes of 5 loads (160 bytes) each == 255 loads per lane.
        for (acr0 = zv, pe = s + acrlimit; s < pe; s += 160) {
#ifdef AVXDEBUG // Dump signed 8-bit hex values from accumulators
            acr0 = opt_count_step(acr0, cv, s);
            p256_hex_8(acr0);
            acr0 = opt_count_step(acr0, cv, s + 32);
            p256_hex_8(acr0);
            acr0 = opt_count_step(acr0, cv, s + 64);
            p256_hex_8(acr0);
            acr0 = opt_count_step(acr0, cv, s + 96);
            p256_hex_8(acr0);
            acr0 = opt_count_step(acr0, cv, s + 128);
            p256_hex_8(acr0);
#else
            acr0 = opt_count_step(acr0, cv, s);
            // Cache hint only: it has no effect on the result.
            _mm_prefetch(s + 1024, _MM_HINT_T0);
            acr0 = opt_count_step(acr0, cv, s + 32);
            acr0 = opt_count_step(acr0, cv, s + 64);
            acr0 = opt_count_step(acr0, cv, s + 96);
            acr0 = opt_count_step(acr0, cv, s + 128);
#endif
        }
        // Widen the 8-bit lane counts into the four 64-bit partial sums.
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr0, zv));
#ifdef AVXDEBUG
        p256_hex_64(sum);
#endif
    }

    // At most acrlimit bytes remain here, i.e. no more than 255 full vectors,
    // so a single 8-bit accumulator still cannot overflow.
    for (acr0 = zv; e - s >= 32; s += 32)
        acr0 = opt_count_step(acr0, cv, s);
    sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr0, zv));

    int64_t count =
        _mm256_extract_epi64(sum, 0)
        + _mm256_extract_epi64(sum, 1)
        + _mm256_extract_epi64(sum, 2)
        + _mm256_extract_epi64(sum, 3);

    // Scalar tail (fewer than 32 bytes). `<` rather than `!=` so a pointer
    // that is already at or past `e` can never run outside the range.
    while (s < e)
        count += *s++ == c;

    return count;
}

uint64_t opt_count_parallel(const char *begin, const char *end, const char target, bool singleThreaded) noexcept {
uint64_t counter(const char *begin, const char *end, const char target, bool singleThreaded) noexcept {
if (singleThreaded)
return opt_count(begin, end, target);

const unsigned int num_threads = std::thread::hardware_concurrency();
const size_t total_length = end - begin;

// FIXME: Don't multiply by 1000 when used with verifier
if (total_length < num_threads * 1000)
return opt_count(begin, end, target);
//if (total_length < num_threads * 1000)
// return opt_count(begin, end, target);

const size_t chunk_size = (total_length + num_threads - 1) / num_threads;

std::vector<std::future<uint64_t>> futures(num_threads);
std::array<std::future<int64_t>, num_threads> futures;

uint64_t total_count = 0;
int64_t total_count = 0;

for (unsigned int i = 0; i < num_threads; ++i) {
const char *chunk_begin = begin + i * chunk_size;
Expand All @@ -73,9 +107,9 @@ uint64_t opt_count_parallel(const char *begin, const char *end, const char targe
return total_count;
}

/**
 * @brief Overload of counter() that takes the target as an int (for example the literal 10).
 *
 * The value is forwarded to the char overload when it is representable as a
 * char. Any other value cannot equal a byte of the input, so the count is 0.
 *
 * @param begin          Pointer to the first byte to examine.
 * @param end            Pointer one past the last byte to examine.
 * @param target         Value to look for.
 * @param singleThreaded Passed through unchanged to the char overload.
 * @return Number of matching bytes, or 0 when @p target is not representable as char.
 */
uint64_t counter(const char *begin, const char *end, const int target, bool singleThreaded) noexcept {
    const char narrowed = static_cast<char>(target);
    // A value that does not survive the int -> char -> int round trip lies
    // outside the range of char and therefore cannot match any element.
    if (static_cast<int>(narrowed) != target)
        return 0;
    return counter(begin, end, narrowed, singleThreaded);
}
9 changes: 7 additions & 2 deletions defs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

#include <cstdint>

// Method selectors accepted by the benchmark's bench() routine.
// COUNT_STANDARD: count with std::count.
#define COUNT_STANDARD 0
// COUNT_OPTIMIZED: count with counter(), singleThreaded = true.
#define COUNT_OPTIMIZED 1
// COUNT_OPTIMIZED_PARALLEL: count with counter(), singleThreaded = false.
#define COUNT_OPTIMIZED_PARALLEL 2
// Compile-time number of chunks counter() divides the input into on its
// multi-threaded path; it is also the size of the std::array of futures there.
constexpr unsigned int num_threads = 20;

/**
* @brief Counts occurrences of a target character in a given range of characters using parallel execution.
*
Expand All @@ -15,6 +20,6 @@
* Defaults to false.
* @return The total count of occurrences of the target character in the specified range.
*/
uint64_t opt_count_parallel(const char *begin, const char *end, char target, bool singleThreaded) noexcept;
uint64_t counter(const char *begin, const char *end, char target, bool singleThreaded) noexcept;

uint64_t opt_count_parallel(const char *begin, const char *end, int target, bool singleThreaded) noexcept;
uint64_t counter(const char *begin, const char *end, int target, bool singleThreaded) noexcept;
3 changes: 2 additions & 1 deletion verify/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
add_executable(verify_counter ../counter.cpp verifier.cpp)
add_executable(verify_counter ../counter.cpp verifier.cpp)
remove_definitions(-DAVXDEBUG)
Loading

0 comments on commit 4c857bb

Please sign in to comment.