-
Notifications
You must be signed in to change notification settings - Fork 0
/
host.cpp
166 lines (141 loc) · 6.6 KB
/
host.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
/**
* Copyright (C) 2019-2021 Xilinx, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may
* not use this file except in compliance with the License. A copy of the
* License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
/*******************************************************************************
Description:
This example implements matrix multiplication using a "Systolic Array" based
algorithm design. Systolic array implementations are well suited to FPGAs, so
it is good coding practice to convert a base algorithm into a systolic array
implementation whenever it is feasible to do so.
*******************************************************************************/
#include "xcl2.hpp"
#include <vector>
// Dimension N of the square N x N matrices processed by this host program.
// Matrices are stored as flat arrays and indexed as [row * DATA_SIZE + col].
#define DATA_SIZE 16
// Upper bound for DATA_SIZE; main() refuses to run when DATA_SIZE exceeds it.
#define MAX_SIZE 16
// Software (golden reference) implementation of matrix multiplication.
// Computes out = in1 x in2 for square DATA_SIZE x DATA_SIZE matrices stored as
// flat arrays indexed as [row * DATA_SIZE + col].
//
// in1, in2: input matrices; each must hold at least DATA_SIZE * DATA_SIZE
//           elements (no bounds checking is performed here).
// out:      output matrix; must hold at least DATA_SIZE * DATA_SIZE elements.
//           Every element is overwritten, so its previous contents do not
//           influence the result.
void m_softwareGold(std::vector<int, aligned_allocator<int> >& in1, // Input Matrix 1
                    std::vector<int, aligned_allocator<int> >& in2, // Input Matrix 2
                    std::vector<int, aligned_allocator<int> >& out  // Output Matrix
                    ) {
    for (int i = 0; i < DATA_SIZE; i++) {
        // Offset of row i in the flat arrays; invariant across the inner loops.
        const int row_base = i * DATA_SIZE;
        for (int j = 0; j < DATA_SIZE; j++) {
            // Accumulate the dot product locally and store it once, so the
            // result does not depend on whatever the caller left in out.
            int acc = 0;
            for (int k = 0; k < DATA_SIZE; k++) {
                acc += in1[row_base + k] * in2[k * DATA_SIZE + j];
            }
            out[row_base + j] = acc;
        }
    }
}
// Host entry point: programs a device with the given xclbin, runs the "mmult"
// kernel on two DATA_SIZE x DATA_SIZE matrices, and compares the device result
// with the software reference computed by m_softwareGold().
//
// argv[1]: path to the XCLBIN file.
// Returns EXIT_SUCCESS when device and CPU results match; EXIT_FAILURE on a
// usage error, when DATA_SIZE exceeds MAX_SIZE, when no device could be
// programmed, or on a result mismatch. OpenCL calls are wrapped in OCL_CHECK
// (defined outside this file), which handles failing error codes.
int main(int argc, char** argv) {
    if (argc != 2) {
        std::cout << "Usage: " << argv[0] << " <XCLBIN File>" << std::endl;
        return EXIT_FAILURE;
    }
    std::string binaryFile = argv[1];

    // Reject configurations larger than the supported maximum before allocating.
    if (DATA_SIZE > MAX_SIZE) {
        std::cout << "Size is bigger than internal buffer size, please use a "
                     "size smaller than "
                  << MAX_SIZE << "!" << std::endl;
        return EXIT_FAILURE;
    }

    // Allocate Memory in Host Memory
    const size_t matrix_size = DATA_SIZE * DATA_SIZE;
    const size_t matrix_size_bytes = sizeof(int) * matrix_size;
    cl_int err;
    cl::CommandQueue q;
    cl::Context context;
    cl::Kernel krnl_systolic_array;
    std::vector<int, aligned_allocator<int> > source_in1(matrix_size);
    std::vector<int, aligned_allocator<int> > source_in2(matrix_size);
    std::vector<int, aligned_allocator<int> > source_hw_results(matrix_size);
    std::vector<int, aligned_allocator<int> > source_sw_results(matrix_size);

    // Create the test data and clear both result buffers
    for (size_t i = 0; i < matrix_size; i++) {
        const int value = static_cast<int>(i % 10);
        source_in1[i] = value;
        source_in2[i] = value;
        source_sw_results[i] = 0;
        source_hw_results[i] = 0;
    }

    // OPENCL HOST CODE AREA START
    auto devices = xcl::get_xil_devices();
    // Load the xclbin given on the command line into a host-side buffer that is
    // handed to cl::Program below.
    auto fileBuf = xcl::read_binary_file(binaryFile);
    cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}};
    bool valid_device = false;
    for (unsigned int i = 0; i < devices.size(); i++) {
        auto device = devices[i];
        // Creating Context and Command Queue for selected Device
        OCL_CHECK(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err));
        OCL_CHECK(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err));
        std::cout << "Trying to program device[" << i << "]: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
        // Programming failure is not fatal here: fall through to the next device.
        cl::Program program(context, {device}, bins, nullptr, &err);
        if (err != CL_SUCCESS) {
            std::cout << "Failed to program device[" << i << "] with xclbin file!\n";
        } else {
            std::cout << "Device[" << i << "]: program successful!\n";
            OCL_CHECK(err, krnl_systolic_array = cl::Kernel(program, "mmult", &err));
            valid_device = true;
            break; // we break because we found a valid device
        }
    }
    if (!valid_device) {
        std::cout << "Failed to program any device found, exit!\n";
        // Return instead of calling exit() so local objects are destroyed normally.
        return EXIT_FAILURE;
    }

    // Allocate Buffer in Global Memory (backed by the host vectors via CL_MEM_USE_HOST_PTR)
    OCL_CHECK(err, cl::Buffer buffer_in1(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, matrix_size_bytes,
                                         source_in1.data(), &err));
    OCL_CHECK(err, cl::Buffer buffer_in2(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, matrix_size_bytes,
                                         source_in2.data(), &err));
    OCL_CHECK(err, cl::Buffer buffer_output(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, matrix_size_bytes,
                                            source_hw_results.data(), &err));

    // Kernel arguments: the three buffers followed by the matrix dimensions.
    int a_row = DATA_SIZE;
    int a_col = DATA_SIZE;
    int b_col = DATA_SIZE;
    OCL_CHECK(err, err = krnl_systolic_array.setArg(0, buffer_in1));
    OCL_CHECK(err, err = krnl_systolic_array.setArg(1, buffer_in2));
    OCL_CHECK(err, err = krnl_systolic_array.setArg(2, buffer_output));
    OCL_CHECK(err, err = krnl_systolic_array.setArg(3, a_row));
    OCL_CHECK(err, err = krnl_systolic_array.setArg(4, a_col));
    OCL_CHECK(err, err = krnl_systolic_array.setArg(5, b_col));

    // Copy input data to device global memory
    OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_in1, buffer_in2}, 0 /* 0 means from host*/));

    // Launch the Kernel and wait for it; a failure reported by finish() must not
    // be ignored, otherwise stale output would be compared below.
    OCL_CHECK(err, err = q.enqueueTask(krnl_systolic_array));
    OCL_CHECK(err, err = q.finish());

    // Copy Result from Device Global Memory to Host Local Memory
    OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_output}, CL_MIGRATE_MEM_OBJECT_HOST));
    OCL_CHECK(err, err = q.finish());
    // OPENCL HOST CODE AREA END

    // Compute Software Results
    m_softwareGold(source_in1, source_in2, source_sw_results);

    // Compare the device results against the software reference; stop at the
    // first differing element.
    bool mismatch = false;
    for (size_t i = 0; i < matrix_size; i++) {
        if (source_hw_results[i] != source_sw_results[i]) {
            std::cout << "Error: Result mismatch" << std::endl;
            std::cout << "i = " << i << " CPU result = " << source_sw_results[i]
                      << " Device result = " << source_hw_results[i] << std::endl;
            mismatch = true;
            break;
        }
    }
    std::cout << "TEST " << (mismatch ? "FAILED" : "PASSED") << std::endl;
    return (mismatch ? EXIT_FAILURE : EXIT_SUCCESS);
}