forked from GPGPUCourse/GPGPUTasks2024
Update main_matrix_multiplication.cpp
Showing 1 changed file with 45 additions and 126 deletions.
@@ -1,163 +1,82 @@
-#include <libutils/misc.h>
-#include <libutils/timer.h>
-#include <libutils/fast_random.h>
 #include <libgpu/context.h>
 #include <libgpu/shared_device_buffer.h>
+#include <libutils/fast_random.h>
+#include <libutils/misc.h>
+#include <libutils/timer.h>

-#include "cl/matrix_multiplication_cl.h"
+#include "cl/matrix_transpose_cl.h"

-#include <vector>
 #include <iostream>
 #include <stdexcept>
+#include <vector>

-const int benchmarkingIters = 10;
-const int benchmarkingItersCPU = 1;
-const unsigned int M = 1024;
-const unsigned int K = 1024;
-const unsigned int N = 1024;
-const size_t gflops = ((size_t) M * K * N * 2) / (1000 * 1000 * 1000); // times two: one multiplication and one addition per iteration

-std::vector<float> computeCPU(const float *as, const float *bs)
-{
-    std::vector<float> cs(M*N, 0);

-    timer t;
-    for (int iter = 0; iter < benchmarkingItersCPU; ++iter) {
-        for (int j = 0; j < M; ++j) {
-            for (int i = 0; i < N; ++i) {
-                float sum = 0.0f;
-                for (int k = 0; k < K; ++k) {
-                    sum += as[j * K + k] * bs[k * N + i];
-                }
-                cs[j * N + i] = sum;
-            }
-        }
-        t.nextLap();
-    }

-    std::cout << "CPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
-    std::cout << "CPU: " << gflops / t.lapAvg() << " GFlops" << std::endl;

-    return cs;
-}

-struct KernelConfig {
-    std::string kernel_name;
-    gpu::WorkSize work_size;
-    std::string defines;
-    std::string prefix;
-};

-KernelConfig makeNaiveConfig(unsigned int tile_size)
-{
-    throw std::runtime_error("not implemented");
-    std::string kernel_name = "matrix_multiplication_naive";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
-    std::string defines;
-    std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]";
-    return KernelConfig{kernel_name, work_size, defines, prefix};
-}
+const int benchmarkingIters = 100;
+const unsigned int M = 4096;
+const unsigned int K = 4096;

-KernelConfig makeLocalConfig(unsigned int tile_size)
-{
-    throw std::runtime_error("not implemented");
-    std::string kernel_name = "matrix_multiplication_local";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
-    std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size);
-    std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]";
-    return KernelConfig{kernel_name, work_size, defines, prefix};
-}

-KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt)
-{
-    throw std::runtime_error("not implemented");
-    std::string kernel_name = "matrix_multiplication_local_wpt";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
-    std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt);
-    std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]";
-    return KernelConfig{kernel_name, work_size, defines, prefix};
-}
+void runTest(const std::string &kernel_name, const float *as) {
+    gpu::gpu_mem_32f as_gpu, as_t_gpu;
+    as_gpu.resizeN(M * K);
+    as_t_gpu.resizeN(K * M);

-void runTest(const KernelConfig &config, const float *as, const float *bs, const float *cs_cpu_reference)
-{
-    gpu::gpu_mem_32f as_gpu, bs_gpu, cs_gpu;
-    as_gpu.resizeN(M*K);
-    bs_gpu.resizeN(K*N);
-    cs_gpu.resizeN(M*N);
+    as_gpu.writeN(as, M * K);

-    as_gpu.writeN(as, M*K);
-    bs_gpu.writeN(bs, K*N);

-    ocl::Kernel matrix_multiplication_kernel(matrix_multiplication, matrix_multiplication_length, config.kernel_name, config.defines);
-    matrix_multiplication_kernel.compile();
+    ocl::Kernel matrix_transpose_kernel(matrix_transpose, matrix_transpose_length, kernel_name);
+    matrix_transpose_kernel.compile();

     timer t;
     for (int iter = 0; iter < benchmarkingIters; ++iter) {
-        matrix_multiplication_kernel.exec(config.work_size, as_gpu, bs_gpu, cs_gpu, M, K, N);
+        // For this task it is more natural to use a two-dimensional NDRange. To express that
+        // in the library's terms you need to call a different overload of the WorkSize constructor.
+        // In CLion it is convenient to see which argument variants the constructors have:
+        // place the caret inside the parentheses of the WorkSize constructor -> Ctrl+P -> note that there are overloads with 2, 4 and 6 parameters
+        // - for 1D, 2D and 3D work spaces respectively

+        // TODO uncomment
+        gpu::WorkSize work_size(8, 8, M, K /*TODO*/);
+        matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);

         t.nextLap();
     }

-    std::cout << config.prefix << std::endl;
+    std::cout << "[" << kernel_name << "]" << std::endl;
     std::cout << "    GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
-    std::cout << "    GPU: " << gflops / t.lapAvg() << " GFlops" << std::endl;
+    std::cout << "    GPU: " << M * K / 1000.0 / 1000.0 / t.lapAvg() << " millions/s" << std::endl;

-    std::vector<float> cs(M*N, 0);
-    cs_gpu.readN(cs.data(), M*N);
+    std::vector<float> as_t(M * K, 0);
+    as_t_gpu.readN(as_t.data(), M * K);

     // Check that the results are correct
-    double diff_sum = 0;
-    for (int i = 0; i < M * N; ++i) {
-        double a = cs[i];
-        double b = cs_cpu_reference[i];
-        if (a != 0.0 || b != 0.0) {
-            double diff = fabs(a - b) / std::max(fabs(a), fabs(b));
-            diff_sum += diff;
+    for (int j = 0; j < M; ++j) {
+        for (int i = 0; i < K; ++i) {
+            float a = as[j * K + i];
+            float b = as_t[i * M + j];
+            if (a != b) {
+                throw std::runtime_error("Not the same!");
+            }
         }
     }

-    double diff_avg = diff_sum / (M * N);
-    std::cout << "    Average difference: " << diff_avg * 100.0 << "%" << std::endl;
-    if (diff_avg > 0.01) {
-        throw std::runtime_error("Too big difference!");
-    }
 }

-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
     gpu::Device device = gpu::chooseGPUDevice(argc, argv);

     gpu::Context context;
     context.init(device.device_id_opencl);
     context.activate();

-    std::vector<float> as(M*K, 0);
-    std::vector<float> bs(K*N, 0);
-    FastRandom r(M+K+N);
+    std::vector<float> as(M * K, 0);
+    FastRandom r(M + K);
     for (unsigned int i = 0; i < as.size(); ++i) {
         as[i] = r.nextf();
     }
-    for (unsigned int i = 0; i < bs.size(); ++i) {
-        bs[i] = r.nextf();
-    }
-    std::cout << "Data generated for M=" << M << ", K=" << K << ", N=" << N << std::endl;

-    const std::vector<float> cs_cpu_reference = computeCPU(as.data(), bs.data());

-    // TODO uncomment
-    return 0;

-    runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeNaiveConfig(16), as.data(), bs.data(), cs_cpu_reference.data());

-    runTest(makeLocalConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeLocalConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeLocalConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
+    std::cout << "Data generated for M=" << M << ", K=" << K << std::endl;


-    for (unsigned int tile_size : {4, 8, 16})
-        for (unsigned int wpt : {2, 4, 8, 16})
-            if (wpt <= tile_size)
-                runTest(makeLocalWPTConfig(tile_size, wpt), as.data(), bs.data(), cs_cpu_reference.data());
+    runTest("matrix_transpose_naive", as.data());
+    runTest("matrix_transpose_local_bad_banks", as.data());
+    runTest("matrix_transpose_local_good_banks", as.data());

     return 0;

 }
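
The kernels this commit benchmarks (matrix_transpose_naive, matrix_transpose_local_bad_banks, matrix_transpose_local_good_banks from cl/matrix_transpose_cl.h) are not part of the diff. As a reading aid for the host code, here is a minimal sketch of what two of them could look like, assuming the argument order of the exec call (as_gpu, as_t_gpu, M, K), the 8x8 work groups from gpu::WorkSize(8, 8, M, K), and that global dimension 0 runs over the M rows of as; the tile size, index mapping and kernel bodies below are illustrative assumptions, not the file the commit actually references. For scale, with M = K = 4096 each launch moves 4096 * 4096 ≈ 16.8 million floats, which is the quantity the "millions/s" line reports per second.

// Hypothetical sketch of the transpose kernels; indexing matches the host-side check
// as[j * K + i] == as_t[i * M + j], i.e. as is M x K row-major and as_t is K x M row-major.
__kernel void matrix_transpose_naive(__global const float *as,
                                     __global float *as_t,
                                     unsigned int M,
                                     unsigned int K)
{
    const unsigned int j = get_global_id(0); // row of as,    0..M-1 (assumed mapping)
    const unsigned int i = get_global_id(1); // column of as, 0..K-1
    if (j < M && i < K)
        as_t[i * M + j] = as[j * K + i];
}

#define TILE 8 // assumed to match the 8x8 work group chosen on the host

__kernel void matrix_transpose_local_good_banks(__global const float *as,
                                                __global float *as_t,
                                                unsigned int M,
                                                unsigned int K)
{
    // The +1 padding shifts each row of the tile into a different local-memory bank,
    // so reading the tile back transposed does not cause bank conflicts
    // (dropping the +1 is what would make this the "bad_banks" variant).
    __local float tile[TILE][TILE + 1];

    const unsigned int lj = get_local_id(0);
    const unsigned int li = get_local_id(1);
    const unsigned int j = get_global_id(0); // row of as
    const unsigned int i = get_global_id(1); // column of as

    // Stage an 8x8 tile of as in local memory (assumes M and K are multiples of TILE,
    // which holds for the 4096x4096 benchmark above).
    tile[lj][li] = as[j * K + i];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Each work item writes the element loaded by its mirrored neighbour in the same group,
    // so the whole tile lands in as_t transposed.
    const unsigned int out_row = get_group_id(1) * TILE + lj; // row in as_t,    0..K-1
    const unsigned int out_col = get_group_id(0) * TILE + li; // column in as_t, 0..M-1
    as_t[out_row * M + out_col] = tile[li][lj];
}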