Update main_matrix_multiplication.cpp
Smart781 authored Oct 6, 2024
1 parent 85bbc9e commit f35f8a5
Showing 1 changed file with 45 additions and 126 deletions.
src/main_matrix_multiplication.cpp: 171 changes (45 additions & 126 deletions)
@@ -1,163 +1,82 @@
-#include <libutils/misc.h>
-#include <libutils/timer.h>
-#include <libutils/fast_random.h>
 #include <libgpu/context.h>
 #include <libgpu/shared_device_buffer.h>
+#include <libutils/fast_random.h>
+#include <libutils/misc.h>
+#include <libutils/timer.h>
 
-#include "cl/matrix_multiplication_cl.h"
+#include "cl/matrix_transpose_cl.h"
 
-#include <vector>
 #include <iostream>
+#include <stdexcept>
+#include <vector>
 
-const int benchmarkingIters = 10;
-const int benchmarkingItersCPU = 1;
-const unsigned int M = 1024;
-const unsigned int K = 1024;
-const unsigned int N = 1024;
-const size_t gflops = ((size_t) M * K * N * 2) / (1000 * 1000 * 1000); // times two, because each step is one addition and one multiplication
-
-std::vector<float> computeCPU(const float *as, const float *bs)
-{
-    std::vector<float> cs(M*N, 0);
-
-    timer t;
-    for (int iter = 0; iter < benchmarkingItersCPU; ++iter) {
-        for (int j = 0; j < M; ++j) {
-            for (int i = 0; i < N; ++i) {
-                float sum = 0.0f;
-                for (int k = 0; k < K; ++k) {
-                    sum += as[j * K + k] * bs[k * N + i];
-                }
-                cs[j * N + i] = sum;
-            }
-        }
-        t.nextLap();
-    }
-
-    std::cout << "CPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
-    std::cout << "CPU: " << gflops / t.lapAvg() << " GFlops" << std::endl;
-
-    return cs;
-}
-
-struct KernelConfig {
-    std::string kernel_name;
-    gpu::WorkSize work_size;
-    std::string defines;
-    std::string prefix;
-};
-
-KernelConfig makeNaiveConfig(unsigned int tile_size)
-{
-    throw std::runtime_error("not implemented");
-    std::string kernel_name = "matrix_multiplication_naive";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
-    std::string defines;
-    std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]";
-    return KernelConfig{kernel_name, work_size, defines, prefix};
-}
+const int benchmarkingIters = 100;
+const unsigned int M = 4096;
+const unsigned int K = 4096;
 
-KernelConfig makeLocalConfig(unsigned int tile_size)
-{
-    throw std::runtime_error("not implemented");
-    std::string kernel_name = "matrix_multiplication_local";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
-    std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size);
-    std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]";
-    return KernelConfig{kernel_name, work_size, defines, prefix};
-}
-
-KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt)
-{
-    throw std::runtime_error("not implemented");
-    std::string kernel_name = "matrix_multiplication_local_wpt";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
-    std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt);
-    std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]";
-    return KernelConfig{kernel_name, work_size, defines, prefix};
-}
-
-void runTest(const KernelConfig &config, const float *as, const float *bs, const float *cs_cpu_reference)
-{
-    gpu::gpu_mem_32f as_gpu, bs_gpu, cs_gpu;
-    as_gpu.resizeN(M*K);
-    bs_gpu.resizeN(K*N);
-    cs_gpu.resizeN(M*N);
-
-    as_gpu.writeN(as, M*K);
-    bs_gpu.writeN(bs, K*N);
-
-    ocl::Kernel matrix_multiplication_kernel(matrix_multiplication, matrix_multiplication_length, config.kernel_name, config.defines);
-    matrix_multiplication_kernel.compile();
+void runTest(const std::string &kernel_name, const float *as) {
+    gpu::gpu_mem_32f as_gpu, as_t_gpu;
+    as_gpu.resizeN(M * K);
+    as_t_gpu.resizeN(K * M);
+
+    as_gpu.writeN(as, M * K);
+
+    ocl::Kernel matrix_transpose_kernel(matrix_transpose, matrix_transpose_length, kernel_name);
+    matrix_transpose_kernel.compile();
 
     timer t;
     for (int iter = 0; iter < benchmarkingIters; ++iter) {
-        matrix_multiplication_kernel.exec(config.work_size, as_gpu, bs_gpu, cs_gpu, M, K, N);
+        // For this task a two-dimensional NDRange is the more natural choice. To express that
+        // in the library's terms, call a different overload of the WorkSize constructor.
+        // In CLion it is convenient to inspect the available constructor overloads:
+        // place the caret inside the parentheses of the WorkSize constructor -> Ctrl+P ->
+        // note that there are variants with 2, 4 and 6 parameters, for 1D, 2D and 3D work spaces respectively.
+
+        // TODO uncomment
+        gpu::WorkSize work_size(8, 8, M, K /*TODO*/);
+        matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);
 
         t.nextLap();
     }
 
-    std::cout << config.prefix << std::endl;
+    std::cout << "[" << kernel_name << "]" << std::endl;
     std::cout << "    GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
-    std::cout << "    GPU: " << gflops / t.lapAvg() << " GFlops" << std::endl;
+    std::cout << "    GPU: " << M * K / 1000.0 / 1000.0 / t.lapAvg() << " millions/s" << std::endl;
 
-    std::vector<float> cs(M*N, 0);
-    cs_gpu.readN(cs.data(), M*N);
+    std::vector<float> as_t(M * K, 0);
+    as_t_gpu.readN(as_t.data(), M * K);
 
     // Check the results for correctness
-    double diff_sum = 0;
-    for (int i = 0; i < M * N; ++i) {
-        double a = cs[i];
-        double b = cs_cpu_reference[i];
-        if (a != 0.0 || b != 0.0) {
-            double diff = fabs(a - b) / std::max(fabs(a), fabs(b));
-            diff_sum += diff;
+    for (int j = 0; j < M; ++j) {
+        for (int i = 0; i < K; ++i) {
+            float a = as[j * K + i];
+            float b = as_t[i * M + j];
+            if (a != b) {
+                throw std::runtime_error("Not the same!");
+            }
         }
     }
-
-    double diff_avg = diff_sum / (M * N);
-    std::cout << "    Average difference: " << diff_avg * 100.0 << "%" << std::endl;
-    if (diff_avg > 0.01) {
-        throw std::runtime_error("Too big difference!");
-    }
 }
 
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
     gpu::Device device = gpu::chooseGPUDevice(argc, argv);
 
     gpu::Context context;
     context.init(device.device_id_opencl);
     context.activate();
 
-    std::vector<float> as(M*K, 0);
-    std::vector<float> bs(K*N, 0);
-    FastRandom r(M+K+N);
+    std::vector<float> as(M * K, 0);
+    FastRandom r(M + K);
     for (unsigned int i = 0; i < as.size(); ++i) {
         as[i] = r.nextf();
     }
-    for (unsigned int i = 0; i < bs.size(); ++i) {
-        bs[i] = r.nextf();
-    }
-    std::cout << "Data generated for M=" << M << ", K=" << K << ", N=" << N << std::endl;
-
-    const std::vector<float> cs_cpu_reference = computeCPU(as.data(), bs.data());
-
-    // TODO uncomment
-    return 0;
-
-    runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeNaiveConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
-
-    runTest(makeLocalConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeLocalConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeLocalConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
+    std::cout << "Data generated for M=" << M << ", K=" << K << std::endl;
 
-    for (unsigned int tile_size : {4, 8, 16})
-        for (unsigned int wpt : {2, 4, 8, 16})
-            if (wpt <= tile_size)
-                runTest(makeLocalWPTConfig(tile_size, wpt), as.data(), bs.data(), cs_cpu_reference.data());
+    runTest("matrix_transpose_naive", as.data());
+    runTest("matrix_transpose_local_bad_banks", as.data());
+    runTest("matrix_transpose_local_good_banks", as.data());
 
     return 0;
-
 }
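
Note on the WorkSize comment in the new loop: it says the gpu::WorkSize constructor has overloads with 2, 4 and 6 parameters for 1D, 2D and 3D work spaces. A minimal sketch of the three calls, assuming the argument order (work-group sizes first, then global sizes) that the gpu::WorkSize work_size(8, 8, M, K) call in the diff implies; the function name and sizes are illustrative, not from the library's documentation:

// Sketch of the three WorkSize overloads referenced in the diff's comment.
// Assumes the libgpu headers already included in the listing above.
void workSizeVariants(unsigned int M, unsigned int K) {
    gpu::WorkSize ws1d(256, M * K);        // 2 params: 1D, groups of 256 over M*K work-items
    gpu::WorkSize ws2d(8, 8, M, K);        // 4 params: 2D, 8x8 groups over an M x K grid (as in the diff)
    gpu::WorkSize ws3d(8, 8, 1, M, K, 1);  // 6 params: 3D, here with a degenerate z dimension
}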

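The correctness check in runTest encodes the row-major transpose convention: as is an M x K matrix, so element (j, i) sits at as[j * K + i], and its K x M transpose must carry the same value at as_t[i * M + j]. The exact float comparison (a != b) is sound here because a transpose only moves values and never recomputes them, unlike the multiplication version's relative-error check. A self-contained CPU sketch of the same convention, in plain C++ without the GPU types; names are illustrative:

#include <stdexcept>
#include <vector>

// CPU reference transpose, mirroring the indexing used in runTest above.
// as is M x K row-major; the result is K x M row-major.
std::vector<float> transposeCPU(const std::vector<float> &as, unsigned int M, unsigned int K) {
    std::vector<float> as_t(K * M, 0);
    for (unsigned int j = 0; j < M; ++j) {
        for (unsigned int i = 0; i < K; ++i) {
            as_t[i * M + j] = as[j * K + i];  // element (j, i) -> element (i, j)
        }
    }
    return as_t;
}

// Same check as in the diff: exact comparison is safe because values are
// copied bit-for-bit, so no floating-point rounding can occur.
void checkTransposed(const std::vector<float> &as, const std::vector<float> &as_t,
                     unsigned int M, unsigned int K) {
    for (unsigned int j = 0; j < M; ++j) {
        for (unsigned int i = 0; i < K; ++i) {
            if (as[j * K + i] != as_t[i * M + j]) {
                throw std::runtime_error("Not the same!");
            }
        }
    }
}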