Update main_matrix_multiplication.cpp

Smart781 · Oct 6, 2024 · 77fc1be · 77fc1be
1 parent d161290
commit 77fc1be
Showing 1 changed file with 14 additions and 20 deletions.
diff --git a/src/main_matrix_multiplication.cpp b/src/main_matrix_multiplication.cpp
@@ -19,7 +19,7 @@ const size_t gflops = ((size_t) M * K * N * 2) / (1000 * 1000 * 1000); // умн
 
 std::vector<float> computeCPU(const float *as, const float *bs)
 {
-    std::vector<float> cs(M*N, 0);
+    std::vector<float> cs(M * N, 0);
 
     timer t;
     for (int iter = 0; iter < benchmarkingItersCPU; ++iter) {
@@ -50,29 +50,26 @@ struct KernelConfig {
 
 KernelConfig makeNaiveConfig(unsigned int tile_size)
 {
-    throw std::runtime_error("not implemented");
     std::string kernel_name = "matrix_multiplication_naive";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
+    gpu::WorkSize work_size(tile_size, tile_size, M / tile_size, N / tile_size); // NOTE(review): presumably the arguments are (groupSizeX, groupSizeY, workSizeX, workSizeY) with work sizes counted in work-items; if so, dividing M and N by tile_size launches fewer items than there are M x N outputs - confirm against gpu::WorkSize and the kernel. Integer division also drops the remainder when tile_size does not divide M or N.
     std::string defines;
     std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]";
     return KernelConfig{kernel_name, work_size, defines, prefix};
 }
 
 KernelConfig makeLocalConfig(unsigned int tile_size)
 {
-    throw std::runtime_error("not implemented");
     std::string kernel_name = "matrix_multiplication_local";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
+    gpu::WorkSize work_size(tile_size, tile_size, M / tile_size, N / tile_size); // NOTE(review): presumably the arguments are (groupSizeX, groupSizeY, workSizeX, workSizeY) with work sizes counted in work-items; if so, dividing M and N by tile_size launches fewer items than there are M x N outputs - confirm against gpu::WorkSize and the kernel. The group size here must agree with the TILE_SIZE define passed to the kernel below.
     std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size);
     std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]";
     return KernelConfig{kernel_name, work_size, defines, prefix};
 }
 
 KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt)
 {
-    throw std::runtime_error("not implemented");
     std::string kernel_name = "matrix_multiplication_local_wpt";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
+    gpu::WorkSize work_size(tile_size / wpt, tile_size, M / wpt, N);
     std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt);
     std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]";
     return KernelConfig{kernel_name, work_size, defines, prefix};
@@ -81,12 +78,12 @@ KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt)
 void runTest(const KernelConfig &config, const float *as, const float *bs, const float *cs_cpu_reference)
 {
     gpu::gpu_mem_32f as_gpu, bs_gpu, cs_gpu;
-    as_gpu.resizeN(M*K);
-    bs_gpu.resizeN(K*N);
-    cs_gpu.resizeN(M*N);
+    as_gpu.resizeN(M * K);
+    bs_gpu.resizeN(K * N);
+    cs_gpu.resizeN(M * N);
 
-    as_gpu.writeN(as, M*K);
-    bs_gpu.writeN(bs, K*N);
+    as_gpu.writeN(as, M * K);
+    bs_gpu.writeN(bs, K * N);
 
     ocl::Kernel matrix_multiplication_kernel(matrix_multiplication, matrix_multiplication_length, config.kernel_name, config.defines);
     matrix_multiplication_kernel.compile();
@@ -101,8 +98,8 @@ void runTest(const KernelConfig &config, const float *as, const float *bs, const
     std::cout << "    GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
     std::cout << "    GPU: " << gflops / t.lapAvg() << " GFlops" << std::endl;
 
-    std::vector<float> cs(M*N, 0);
-    cs_gpu.readN(cs.data(), M*N);
+    std::vector<float> cs(M * N, 0);
+    cs_gpu.readN(cs.data(), M * N);
 
     // Проверяем корректность результатов
     double diff_sum = 0;
@@ -130,9 +127,9 @@ int main(int argc, char **argv)
     context.init(device.device_id_opencl);
     context.activate();
 
-    std::vector<float> as(M*K, 0);
-    std::vector<float> bs(K*N, 0);
-    FastRandom r(M+K+N);
+    std::vector<float> as(M * K, 0);
+    std::vector<float> bs(K * N, 0);
+    FastRandom r(M + K + N);
     for (unsigned int i = 0; i < as.size(); ++i) {
         as[i] = r.nextf();
     }
@@ -143,9 +140,6 @@ int main(int argc, char **argv)
 
     const std::vector<float> cs_cpu_reference = computeCPU(as.data(), bs.data());
 
-    // TODO uncomment
-    return 0;
-
     runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
     runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
     runTest(makeNaiveConfig(16), as.data(), bs.data(), cs_cpu_reference.data());