# 030_timing.py
# Forked from benshope/PyOpenCL-Tutorial
# Test the speed of your PyOpenCL program
from time import time # Import time tools
import pyopencl as cl # Import the OpenCL GPU computing API
import numpy as np # Import number tools
# Two independent random float32 vectors of 1000 elements each; these are the
# operands that both the CPU and the GPU implementation add together.
a, b = (np.random.rand(1000).astype(np.float32) for _ in range(2))
def cpu_array_sum(a, b, repeats=1000):
    """Add two 1-D arrays element by element on the CPU and print the time taken.

    Every element-wise addition is deliberately performed ``repeats`` times so
    that the CPU does a comparable amount of work to the inner loop of the
    OpenCL kernel used by the GPU version.

    Args:
        a: First 1-D NumPy array; its length and dtype define the result.
        b: Second 1-D NumPy array, indexed in lockstep with ``a`` (an
            ``IndexError`` is raised if it is shorter than ``a``).
        repeats: Number of times each addition is repeated (default 1000).

    Returns:
        A new array shaped like ``a`` where element ``i`` is ``a[i] + b[i]``.

    Raises:
        ValueError: If ``repeats`` is smaller than 1, because the result
            array would otherwise be returned uninitialised.
    """
    if repeats < 1:
        raise ValueError("repeats must be at least 1")

    c_cpu = np.empty_like(a)  # Destination array (uninitialised until filled)
    cpu_start_time = time()  # Wall-clock start of the CPU computation
    # Walk the actual length of the input rather than a hard-coded 1000 so
    # shorter arrays do not raise and longer arrays are summed completely.
    for i in range(len(a)):
        for _ in range(repeats):
            c_cpu[i] = a[i] + b[i]
    cpu_end_time = time()  # Wall-clock end of the CPU computation
    print("CPU Time: {0} s".format(cpu_end_time - cpu_start_time))
    return c_cpu
def gpu_array_sum(a, b):
    """Add two 1-D float32 arrays on an OpenCL device and print timings.

    Two timings are printed: the kernel execution time taken from the
    OpenCL profiling event, and the wall-clock time covering buffer creation
    (host-to-device copies), the kernel run and the device-to-host read-back.

    Args:
        a: First 1-D float32 NumPy array; its shape is the kernel's global size.
        b: Second 1-D float32 NumPy array, expected to match ``a`` in length.

    Returns:
        A new array shaped like ``a`` holding the element-wise sum computed
        by the device.
    """
    context = cl.create_some_context()  # Initialize the Context
    # Profiling must be enabled on the queue for event.profile to be available
    queue = cl.CommandQueue(
        context, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Compile the device program before starting the timer so that build time
    # is not counted as part of the GPU run.
    program = cl.Program(context, """
    __kernel void sum(__global const float *a, __global const float *b, __global float *c)
    {
        int i = get_global_id(0);
        int j;
        for(j = 0; j < 1000; j++)
        {
            c[i] = a[i] + b[i];
        }
    }""").build()

    # Start timing before the input buffers are created so the measurement
    # really includes both memory copies (upload and read-back).
    gpu_start_time = time()
    mem_flags = cl.mem_flags
    a_buffer = cl.Buffer(
        context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=a)
    b_buffer = cl.Buffer(
        context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=b)
    # The output is sized from ``a`` because the kernel range and the
    # read-back array below are both derived from ``a``.
    c_buffer = cl.Buffer(context, mem_flags.WRITE_ONLY, a.nbytes)

    # Enqueue one work-item per element of ``a`` and wait for completion
    event = program.sum(queue, a.shape, None, a_buffer, b_buffer, c_buffer)
    event.wait()
    # Profiling timestamps are scaled by 1e-9 to report seconds
    elapsed = 1e-9 * (event.profile.end - event.profile.start)
    print("GPU Kernel Time: {0} s".format(elapsed))

    c_gpu = np.empty_like(a)  # Host array that receives the result
    # enqueue_copy replaces the legacy enqueue_read_buffer call, which is not
    # available in current PyOpenCL releases.
    cl.enqueue_copy(queue, c_gpu, c_buffer).wait()
    gpu_end_time = time()
    print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time))
    return c_gpu
# Run the CPU benchmark first, then the GPU benchmark, on the same inputs;
# each function prints its own timing information.
for run_benchmark in (cpu_array_sum, gpu_array_sum):
    run_benchmark(a, b)