-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSlides-part1.tex
300 lines (237 loc) · 8.94 KB
/
Slides-part1.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\talksection{SPMD Execution Model}
\begin{frame}{SPMD Execution Model}
\begin{itemize}
%\item Data-parallel
%\begin{itemize}
% \item Work needs to be divided into chunks
%\end{itemize}
\item Single program
\begin{itemize}
\item Scalar form, but implicit SIMD execution
\end{itemize}
\item Multiple instances running in parallel
\begin{itemize}
\item Each instance working on a different data chunk
\end{itemize}
\item On GPU
\begin{itemize}
\item Divide work between lanes of SIMD units (fine division)
\item SIMD execution in lockstep
\end{itemize}
\item On CPU
\begin{itemize}
\item Divide work between cores (coarse division)
\item Sequential execution within a core (naive approach)
\end{itemize}
\end{itemize}
%\begin{itemize}
% \item Simplistic example:
% \begin{itemize}
% \item Massive Online Course
% \item Compute the overall grade (GPA) of millions of students
% \item GPA is the weighted average of several grades
% \end{itemize}
%\end{itemize}
\end{frame}
%% Kernel function has no return value, takes in buffers (arrays)
%% Vectorization does not change the signature for kernels
%% Can also vectorize "normal" functions, but this results in signature changes
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{Single Program}
\begin{itemize}
\item Kernel function
\begin{itemize}
\item Entry point for the computation
\end{itemize}
\item Executed once per work-item
\begin{itemize}
\item As if there was a loop around it (but no dependency between iterations)
\item Access to the iteration counter using \texttt{\varying{get\_global\_id(0)}}
\end{itemize}
\end{itemize}
\vspace{1ex}
\begin{minipage}[t]{0.45\linewidth}
\begin{codebox}[commandchars=\\\[\]]
kernel void add_uniform(int *dst,
int *src,
int alpha) {
int \varying[tid] = \varying[get_global_id(0)];
dst\idx[\varying[tid]] = src\idx[\varying[tid]] + (alpha - 1);
}
\end{codebox}
\end{minipage}\hspace{1em}\begin{minipage}[t]{0.49\linewidth}
\begin{codebox}[commandchars=\\\[\]]
int *dst = ...;
int *src = ...;
int alpha = ...;
for (int \varying[tid] = 0; \varying[tid] < num_items; \varying[tid]++) {
dst\idx[\varying[tid]] = src\idx[\varying[tid]] + (alpha - 1);
}
\end{codebox}
\end{minipage}
%\begin{codebox}
%kernel void calc_gpa(*float result, *int grades, *float weights,
% int num_grades, int num_students) {
% int student_id = get_global_id(0);
% float gpa = 0.0;
% for (int i = 0; i < num_grades; i++) {
% int grade = grades[(i * num_students) + student_id];
% float weight = weights[i];
% gpa += (grade * weight);
% }
% result[student_id] = gpa;
%}
%\end{codebox}
\end{frame}
\begin{frame}{Division of Work}
\begin{itemize}
\item Work-item
\begin{itemize}
\item Unit of work
\item One instance of a program
\item Executed in parallel by Execution Units (threads)
\end{itemize}
\item Iteration space
\begin{itemize}
\item 1D (array shape), used in this tutorial
\item 2D (grid shape)
\item ...
\end{itemize}
\end{itemize}
\vspace{4ex}
\hspace{1em}\includegraphics[scale=0.50]{images/work-items.pdf}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\talksection{Vectorization}
\begin{frame}{Why Vectorize?}
\begin{itemize}
\item Many executions units each executing one instance of a single program
\begin{itemize}
\item Works well on GPU (many hardware threads)
\item Not so much on CPU (very few cores)
\item CPU has to execute many work-items sequentially
\end{itemize}
\item Speed up this sequential computation using SIMD units
\begin{itemize}
\item Vertical Vectorization
\item Horizontal Vectorization
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Vertical Vectorization}
\begin{itemize}
\item Patterns within a single work-item
\begin{itemize}
\item e.g. loops within a kernel
%\item In our example, the weighted average computation
\end{itemize}
\item Using the LLVM Loop Vectorizer or SLP Vectorizer
\item However, not all kernels contain vectorizable patterns
\end{itemize}
\vspace{5ex}
\hspace{1em}\includegraphics[scale=0.40]{images/vertical-vectorization.pdf}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Horizontal Vectorization}
\begin{itemize}
\item Across work-items
\begin{itemize}
\item Compute multiple work-items at the same time
\item Take advantage of the execution model (single program, multiple data)
\end{itemize}
\item Does not depend on special code patterns like loops
\item SPMD Vectorizer
\end{itemize}
\vspace{3ex}
\hspace{0.55em}\includegraphics[scale=0.40]{images/horizontal-vectorization.pdf}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{SPMD Vectorizer}
\begin{minipage}[t]{0.45\linewidth}
\begin{itemize}
\item Vectorizes a SPMD program's entry point function
\item Given a function $F$ and vectorization factor $N$, produces a function $VF_N$
\begin{itemize}
\item Calling $VF_N$ is like calling $F$, but $N$ times (on consecutive work-items)
\item $F$ and $VF_N$ have the same signature
\end{itemize}
%\item The original function can be preserved (cloned) or not (in-place vectorization)
%\begin{itemize}
% \item Working on cloned function allows vectorization to fail gracefully
%\end{itemize}
\item Vectorization may be allowed to fail
\begin{itemize}
\item Work on cloned function
\item Use original function on failure
\end{itemize}
\end{itemize}
\end{minipage}
\hspace{1em}
\begin{minipage}[t]{0.48\linewidth}
\vspace{-0.1ex}
\begin{tcolorbox}
\includegraphics[width=\textwidth]{images/F-vs-VFn.pdf}
\end{tcolorbox}
\end{minipage}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{frame}{Vectorizer Comparison}
%
%\begin{itemize}
% \item Loop Vectorizer
% \begin{itemize}
% \item Can be used for both vertical and horizontal vectorization
% \item Has to enforce dependencies between loop iterations
% \item Execution order is not specified, this is not needed
% \item Nested control-flow?
% \end{itemize}
%
% \item SLP Vectorizer
% \begin{itemize}
% \item Finds groups of similar scalar instructions (same opcode)
% \item Vertical vectorization
% \item Not all kernels contain this kind of code
% \end{itemize}
%
% \item SPMD Vectorizer
% \begin{itemize}
% \item Only supports horizontal vectorization
% \item Nested control-flow
% \item Not limited to a certain 'style' of code
% \end{itemize}
%\end{itemize}
%
%\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Implementation Level: IR or MI?}
\begin{itemize}
\item IR
\\ \pointfor{Use-def graph and RAUW make for straightforward graph transformations}
\\ \pointfor{Easy to target multiple platforms}
\\ \pointunkn{Generally higher-level (simpler implementation?)}
\\ \pointagainst{Platform-specific features more difficult to use}
\\ \pointagainst{SIMD predication only for a few operations (select, load/stores)}
\item MachineInstr (i.e. backend level)
\\ \pointfor{Easy to use platform-specific features (e.g. predication, mask registers)}
\\ \pointunkn{Generally lower-level (more powerful?)}
\\ \pointagainst{More platform-specific code}
\\ \pointagainst{Graph-based transformations not as straightforward}
\item Both?
\begin{itemize}
\item Transformations at IR level, generating CFG-specific metadata
\item Use metadata in backend to do MI-level predication
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Glossary}
\glossaryword{Work-item}{Unit of computation to execute in parallel}
\glossaryword{Instance}{State associated with one work-item}
\glossaryword{SIMD Lane}{Execution of one instance of a vectorized kernel}
\glossaryword{SIMD Group}{Contains all lanes that can be executed in parallel (at the same time)}
\glossaryword{SIMD Width}{Number of parallel lanes, equal to vectorization factor ($N$)}
\glossaryword{Packet}{Maps $1$ value in the original function to $N$ values, one per SIMD lane}
\end{frame}