Update main_matrix_multiplication.cpp
Smart781 authored Oct 6, 2024
1 parent 85bbc9e commit f35f8a5
Showing 1 changed file with 45 additions and 126 deletions.
src/main_matrix_multiplication.cpp: 171 changes (45 additions & 126 deletions)
@@ -1,163 +1,82 @@
-#include <libutils/misc.h>
-#include <libutils/timer.h>
-#include <libutils/fast_random.h>
 #include <libgpu/context.h>
 #include <libgpu/shared_device_buffer.h>
+#include <libutils/fast_random.h>
+#include <libutils/misc.h>
+#include <libutils/timer.h>
 
-#include "cl/matrix_multiplication_cl.h"
+#include "cl/matrix_transpose_cl.h"
 
-#include <vector>
 #include <iostream>
+#include <stdexcept>
+#include <vector>
 
-const int benchmarkingIters = 10;
-const int benchmarkingItersCPU = 1;
-const unsigned int M = 1024;
-const unsigned int K = 1024;
-const unsigned int N = 1024;
-const size_t gflops = ((size_t) M * K * N * 2) / (1000 * 1000 * 1000); // times two, because each step is one addition and one multiplication
-
-std::vector<float> computeCPU(const float *as, const float *bs)
-{
-    std::vector<float> cs(M*N, 0);
-
-    timer t;
-    for (int iter = 0; iter < benchmarkingItersCPU; ++iter) {
-        for (int j = 0; j < M; ++j) {
-            for (int i = 0; i < N; ++i) {
-                float sum = 0.0f;
-                for (int k = 0; k < K; ++k) {
-                    sum += as[j * K + k] * bs[k * N + i];
-                }
-                cs[j * N + i] = sum;
-            }
-        }
-        t.nextLap();
-    }
-
-    std::cout << "CPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
-    std::cout << "CPU: " << gflops / t.lapAvg() << " GFlops" << std::endl;
-
-    return cs;
-}
-
-struct KernelConfig {
-    std::string kernel_name;
-    gpu::WorkSize work_size;
-    std::string defines;
-    std::string prefix;
-};
-
-KernelConfig makeNaiveConfig(unsigned int tile_size)
-{
-    throw std::runtime_error("not implemented");
-    std::string kernel_name = "matrix_multiplication_naive";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
-    std::string defines;
-    std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]";
-    return KernelConfig{kernel_name, work_size, defines, prefix};
-}
+const int benchmarkingIters = 100;
+const unsigned int M = 4096;
+const unsigned int K = 4096;
 
-KernelConfig makeLocalConfig(unsigned int tile_size)
-{
-    throw std::runtime_error("not implemented");
-    std::string kernel_name = "matrix_multiplication_local";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
-    std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size);
-    std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]";
-    return KernelConfig{kernel_name, work_size, defines, prefix};
-}
-
-KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt)
-{
-    throw std::runtime_error("not implemented");
-    std::string kernel_name = "matrix_multiplication_local_wpt";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
-    std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt);
-    std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]";
-    return KernelConfig{kernel_name, work_size, defines, prefix};
-}
-
-void runTest(const KernelConfig &config, const float *as, const float *bs, const float *cs_cpu_reference)
-{
-    gpu::gpu_mem_32f as_gpu, bs_gpu, cs_gpu;
-    as_gpu.resizeN(M*K);
-    bs_gpu.resizeN(K*N);
-    cs_gpu.resizeN(M*N);
-
-    as_gpu.writeN(as, M*K);
-    bs_gpu.writeN(bs, K*N);
-
-    ocl::Kernel matrix_multiplication_kernel(matrix_multiplication, matrix_multiplication_length, config.kernel_name, config.defines);
-    matrix_multiplication_kernel.compile();
+void runTest(const std::string &kernel_name, const float *as) {
+    gpu::gpu_mem_32f as_gpu, as_t_gpu;
+    as_gpu.resizeN(M * K);
+    as_t_gpu.resizeN(K * M);
+
+    as_gpu.writeN(as, M * K);
+
+    ocl::Kernel matrix_transpose_kernel(matrix_transpose, matrix_transpose_length, kernel_name);
+    matrix_transpose_kernel.compile();
 
     timer t;
     for (int iter = 0; iter < benchmarkingIters; ++iter) {
-        matrix_multiplication_kernel.exec(config.work_size, as_gpu, bs_gpu, cs_gpu, M, K, N);
+        // For this task a two-dimensional NDRange is the more natural choice. To express that
+        // in the library's terms, call a different overload of the WorkSize constructor.
+        // In CLion it is convenient to inspect the available constructor overloads:
+        // place the caret inside the parentheses of the WorkSize constructor -> Ctrl+P ->
+        // note that there are variants with 2, 4 and 6 parameters, for 1D, 2D and 3D work spaces respectively.
+
+        // TODO uncomment
+        gpu::WorkSize work_size(8, 8, M, K /*TODO*/);
+        matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);
 
         t.nextLap();
     }
 
-    std::cout << config.prefix << std::endl;
+    std::cout << "[" << kernel_name << "]" << std::endl;
     std::cout << "    GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
-    std::cout << "    GPU: " << gflops / t.lapAvg() << " GFlops" << std::endl;
+    std::cout << "    GPU: " << M * K / 1000.0 / 1000.0 / t.lapAvg() << " millions/s" << std::endl;
 
-    std::vector<float> cs(M*N, 0);
-    cs_gpu.readN(cs.data(), M*N);
+    std::vector<float> as_t(M * K, 0);
+    as_t_gpu.readN(as_t.data(), M * K);
 
     // Check the results for correctness
-    double diff_sum = 0;
-    for (int i = 0; i < M * N; ++i) {
-        double a = cs[i];
-        double b = cs_cpu_reference[i];
-        if (a != 0.0 || b != 0.0) {
-            double diff = fabs(a - b) / std::max(fabs(a), fabs(b));
-            diff_sum += diff;
+    for (int j = 0; j < M; ++j) {
+        for (int i = 0; i < K; ++i) {
+            float a = as[j * K + i];
+            float b = as_t[i * M + j];
+            if (a != b) {
+                throw std::runtime_error("Not the same!");
+            }
         }
     }
-
-    double diff_avg = diff_sum / (M * N);
-    std::cout << "    Average difference: " << diff_avg * 100.0 << "%" << std::endl;
-    if (diff_avg > 0.01) {
-        throw std::runtime_error("Too big difference!");
-    }
 }
 
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
     gpu::Device device = gpu::chooseGPUDevice(argc, argv);
 
     gpu::Context context;
     context.init(device.device_id_opencl);
     context.activate();
 
-    std::vector<float> as(M*K, 0);
-    std::vector<float> bs(K*N, 0);
-    FastRandom r(M+K+N);
+    std::vector<float> as(M * K, 0);
+    FastRandom r(M + K);
     for (unsigned int i = 0; i < as.size(); ++i) {
         as[i] = r.nextf();
     }
-    for (unsigned int i = 0; i < bs.size(); ++i) {
-        bs[i] = r.nextf();
-    }
-    std::cout << "Data generated for M=" << M << ", K=" << K << ", N=" << N << std::endl;
-
-    const std::vector<float> cs_cpu_reference = computeCPU(as.data(), bs.data());
-
-    // TODO uncomment
-    return 0;
-
-    runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeNaiveConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
-
-    runTest(makeLocalConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeLocalConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeLocalConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
+    std::cout << "Data generated for M=" << M << ", K=" << K << std::endl;
 
-    for (unsigned int tile_size : {4, 8, 16})
-        for (unsigned int wpt : {2, 4, 8, 16})
-            if (wpt <= tile_size)
-                runTest(makeLocalWPTConfig(tile_size, wpt), as.data(), bs.data(), cs_cpu_reference.data());
+    runTest("matrix_transpose_naive", as.data());
+    runTest("matrix_transpose_local_bad_banks", as.data());
+    runTest("matrix_transpose_local_good_banks", as.data());
 
     return 0;
-
 }
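
Note on the WorkSize comment in the new loop: it says the gpu::WorkSize constructor has overloads with 2, 4 and 6 parameters for 1D, 2D and 3D work spaces. A minimal sketch of the three calls, assuming the argument order (work-group sizes first, then global sizes) that the gpu::WorkSize work_size(8, 8, M, K) call in the diff implies; the function name and sizes are illustrative, not from the library's documentation:

// Sketch of the three WorkSize overloads referenced in the diff's comment.
// Assumes the libgpu headers already included in the listing above.
void workSizeVariants(unsigned int M, unsigned int K) {
    gpu::WorkSize ws1d(256, M * K);        // 2 params: 1D, groups of 256 over M*K work-items
    gpu::WorkSize ws2d(8, 8, M, K);        // 4 params: 2D, 8x8 groups over an M x K grid (as in the diff)
    gpu::WorkSize ws3d(8, 8, 1, M, K, 1);  // 6 params: 3D, here with a degenerate z dimension
}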

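The correctness check in runTest encodes the row-major transpose convention: as is an M x K matrix, so element (j, i) sits at as[j * K + i], and its K x M transpose must carry the same value at as_t[i * M + j]. The exact float comparison (a != b) is sound here because a transpose only moves values and never recomputes them, unlike the multiplication version's relative-error check. A self-contained CPU sketch of the same convention, in plain C++ without the GPU types; names are illustrative:

#include <stdexcept>
#include <vector>

// CPU reference transpose, mirroring the indexing used in runTest above.
// as is M x K row-major; the result is K x M row-major.
std::vector<float> transposeCPU(const std::vector<float> &as, unsigned int M, unsigned int K) {
    std::vector<float> as_t(K * M, 0);
    for (unsigned int j = 0; j < M; ++j) {
        for (unsigned int i = 0; i < K; ++i) {
            as_t[i * M + j] = as[j * K + i];  // element (j, i) -> element (i, j)
        }
    }
    return as_t;
}

// Same check as in the diff: exact comparison is safe because values are
// copied bit-for-bit, so no floating-point rounding can occur.
void checkTransposed(const std::vector<float> &as, const std::vector<float> &as_t,
                     unsigned int M, unsigned int K) {
    for (unsigned int j = 0; j < M; ++j) {
        for (unsigned int i = 0; i < K; ++i) {
            if (as[j * K + i] != as_t[i * M + j]) {
                throw std::runtime_error("Not the same!");
            }
        }
    }
}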