-
Notifications
You must be signed in to change notification settings - Fork 1
/
vecgen.hpp
344 lines (289 loc) · 14.7 KB
/
vecgen.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
// Copyright 2013, Andrew Ross
// Distributable under the GNU LGPL v2.1 , see COPYING for details
#ifndef _VECGEN_HPP
#define _VECGEN_HPP
//
// Top-level code generation for vector SIMD code. A vecgen object
// can create vr ("virtual register") objects representing known
// storage and compose them together in C++ expressions that will
// automatically generate code to evaluate that expression. The
// resulting code (after copying to executable memory) can be invoked
// through a C function pointer with known signature.
//
#include <list>
#include <string>
#include <vector>
#include <map>
#include <deque>
#include <cstring>
enum op { NOP=0, FADD, FSUB, FMUL, RCP, RSQ, MUX, FEQ, FNE, FLT, FGT, FLE, FGE,
FLOOR, CEIL, F2I, I2F, AND, OR, ANDN, BITANDN, BITAND, BITOR, BITXOR, FMA, FMAX,
FMIN, MOV, LOAD, STORE, IF, ELSE, FI, LOOP, BREAK, POOL, CULL, CULL_FLD, DEBUG_LOG };
enum memsz { U8, S8, U16, S16, U32 };
class vecgen;
// "Virtual register". These are C++ objects that can be passed
// around and built into expressions using traditional syntax. Those
// expressions automatically generate their code when used as an
// rvalue in an assignment or as an argument to start_if().
class vr {
public:
// Default-constructed vr's become scratch registers on assignment
vr() { init(TMP); }
// Copy constructor produces a scratch register that gets assigned
// from the rhs unless (!) the rhs is the output of one of vecgen
// register factories (input/output/constant/immediate), which get
// cloned.
vr(const vr& v);
// Converts from float to a runtime immediate. Note: float, not
// int. Use vecgen::imm_i to get a bit pattern.
vr(float f) { init(IMM); val=f; }
vr operator+(const vr &a) const { return vrdup(vr(FADD, this, &a)); }
vr operator-(const vr &a) const { return vrdup(vr(FSUB, this, &a)); }
vr operator*(const vr &a) const { return vrdup(vr(FMUL, this, &a)); }
vr operator&(const vr &a) const { return vrdup(vr(BITAND, this, &a)); }
vr operator|(const vr &a) const { return vrdup(vr(BITOR, this, &a)); }
vr operator&&(const vr &a) const { return vrdup(vr(AND, this, &a)); }
vr operator||(const vr &a) const { return vrdup(vr(OR, this, &a)); }
vr operator^(const vr &a) const { return vrdup(vr(BITXOR, this, &a)); }
vr operator==(const vr &a) const { return vrdup(vr(FEQ, this, &a)); }
vr operator!=(const vr &a) const { return vrdup(vr(FNE, this, &a)); }
vr operator<(const vr &a) const { return vrdup(vr(FLT, this, &a)); }
vr operator<=(const vr &a) const { return vrdup(vr(FLE, this, &a)); }
vr operator>(const vr &a) const { return vrdup(vr(FGT, this, &a)); }
vr operator>=(const vr &a) const { return vrdup(vr(FGE, this, &a)); }
// A precision warning about recip/rsqrt: on AVX, these can return
// exact zeros for very large inputs (>= 2^126) instead of finite
// values.
friend vr recip(const vr &a) { return vrdup(vr(RCP, &a)); }
friend vr rsqrt(const vr &a) { return vrdup(vr(RSQ, &a)); }
friend vr andnot(const vr &a, const vr &b) { return vrdup(vr(ANDN, &a, &b)); }
friend vr bitandnot(const vr &a, const vr &b) { return vrdup(vr(BITANDN, &a, &b)); }
friend vr fma(const vr &a, const vr &b, vr c) { return vrdup(vr(FMA, &a, &b, &c)); }
friend vr floor(const vr &a) { return vrdup(vr(FLOOR, &a)); }
friend vr ceil(const vr &a) { return vrdup(vr(CEIL, &a)); }
friend vr f2i(const vr &a) { return vrdup(vr(F2I, &a)); }
friend vr i2f(const vr &a) { return vrdup(vr(I2F, &a)); }
friend vr max(const vr &a, const vr &b) { return vrdup(vr(FMAX, &a, &b)); }
friend vr min(const vr &a, const vr &b) { return vrdup(vr(FMIN, &a, &b)); }
friend vr mux(const vr &sel, const vr &a, const vr &b) { return vrdup(vr(MUX, &b, &a, &sel)); }
vr& operator=(const vr &a);
vr& operator+=(const vr &a) { return *this = *this + a; }
vr& operator-=(const vr &a) { return *this = *this - a; }
vr& operator*=(const vr &a) { return *this = *this * a; }
vr& operator&=(const vr &a) { return *this = *this & a; }
vr& operator|=(const vr &a) { return *this = *this | a; }
vr& operator^=(const vr &a) { return *this = *this ^ a; }
void setvg(vecgen *v) { vg = v; }
// Memory operations. Remember the i argument should be an
// integer, e.g. from f2i(). Note also that because the integer
// comes out of float math, it's limited to 24 bits of precision
// (i.e. 16/32/64MB maximum for 1/2/4 byte elements) unless
// special care is taken to composite integers via bitwise math.
friend vr load(memsz sz, int obj, const vr &i);
friend void store(memsz sz, int obj, const vr &i, const vr &val);
friend vr imm(float f);
friend vr imm_i(int i);
private:
friend class vecgen;
typedef enum { REG, OP, IMM, IMMI, TMP } vrmode;
// This exists to defeat the return value optimization. We want
// "vr a = b+c;" to produce "a" as a scratch register (via the
// copy constructor from "b+c"), not the FADD operation. But RVO
// is legally allowed to construct "b+c" directly in the memory
// allocated for a, which is not what we want.
friend vr vrdup(const vr &a) { return a; }
vr(op op, const vr *ap=0, const vr *bp=0, const vr *cp=0)
{ init(OP); o=op; copyvg(ap, bp, cp); a=ap; b=bp; c=cp; }
vr(op op, memsz sz, int obj, const vr *ap, const vr *bp=0)
{ init(OP); o=op; copyvg(ap, bp); imm0=sz; imm1=obj; a=ap; b=bp; }
vr(vecgen* v, int r) { init(REG); vg=v; rid=r; cloneme=true; }
void init(vrmode m) { memset(this, 0, sizeof(vr)); mode = m; }
void copyvg(const vr *a=0, const vr *b=0, const vr *c=0)
{ if(!vg) { vg = a->vg ? a->vg : (b->vg ? b->vg : c->vg); } }
vrmode mode;
int rid;
vecgen *vg;
const vr *a, *b, *c;
float val;
int ival;
op o;
int imm0, imm1;
bool cloneme;
};
inline vr imm(float f) { return vr(f); }
inline vr imm_i(int i) { vr v; v.mode = vr::IMMI; v.ival = i; return v; }
inline vr recip2(const vr &a) { vr b=recip(a); return (imm(2)-a*b)*b; }
inline vr rsqrt2(const vr &a) { vr b=rsqrt(a); return (b*0.5)*(imm(3)-a*b*b); }
template<class A, class B>
inline bool contains(const A &a, const B &b) { return a.find(b) != a.end(); }
// Trivial bidirectional id/value mapping for register files.
struct regmap {
int probe(int value, int& nextid) {
if(v2i.find(value) == v2i.end()) {
v2i[value] = nextid++;
i2v[v2i[value]] = value;
}
return v2i[value];
}
bool hasid(int id) { return i2v.find(id) != i2v.end(); }
int value(int id) { return i2v[id]; }
std::map<int,int> i2v, v2i;
};
struct insn {
op o;
int dst, a, b, c;
int imm0, imm1, imm2;
};
typedef void (*vecgen_fn)(float* consts, float* ins, float* outs, void **mems, float *msk);
class vecgen {
public:
vecgen(const char *opts=0);
// Register generators. Immediates are indexed and unified by
// their values; inputs and outputs are indexed arrays passed to
// the generated function. Temporary scratch registers can be
// created as needed, or will be created via the expression
// generators.
vr imm_i(int val) { return vr(this, gen_immediate(val)); }
vr imm(float f);
vr constant(int idx) { return vr(this, consts.probe(idx, next_id)); }
vr input(int idx) { return vr(this, inputs.probe(idx, next_id)); }
vr output(int idx) { return vr(this, outputs.probe(idx, next_id)); }
vr scratch() { return vr(this, nextid()); }
// Newton-Raphson refinements of the low-precision ops
vr recip2(const vr &a) { vr b=recip(a); return (imm(2)-a*b)*b; }
vr rsqrt2(const vr &a) { vr b=rsqrt(a); return (b*0.5)*(imm(3)-a*b*b); }
void start_loop() { emit(LOOP); flow.push_back(IN_LOOP); }
void break_loop();
void end_loop();
void start_if(vr &a) { gen(&a); emit(IF, 0, a.rid); flow.push_back(IN_IF); }
void start_else();
void end_if();
// Simple sequential allocation of memory array (the "mems"
// argument to the generated code) indexes. Different code
// generation modules need to be able to share this space without
// clobbering each other.
int alloc_memidx() { return nmems++; }
int n_mems() { return nmems; }
// Emit a scalar hsoa "slice" for each unmasked SIMD thread.
// Takes indices into the mem[] array, one of which is an aligned
// pointer to a presized array of hsoa records, the other is an
// integer indicating how many entries [0:N-1] in the current
// record are already filled. Both are mutated by the generated
// code and can thus be reused by another iteration. The intent
// of this API is to efficiently implement a "culling" step, where
// one SIMD task (say, a vertex shader) conditionally generates
// output records for another (a fragment shader).
int cull(vr* regs, int nregs, int memidx_ptr, int memidx_count);
// By default, cull regions are unbounded, they will continue
// writing to memory as long as there is new data. Bounds can be
// set at any time before codegen(). This takes three indexes
// into the mems[] array. The first is a "maximum" pointer value
// that the cull pointer is allowed to have. After it is full the
// function pointer stored at mi_callback (cdecl, signature
// "void(*)(void*)") will be invoked with the uninspected pointer
// value found at mi_cbdata. The callback might be expected to
// reset the ptr/count values with a new memory region, etc...
void set_cull_bound(int cullid, int mi_max, int mi_cbdata, int mi_callback);
// Insert a log message into the interpreter stream. Ignored in
// the generated code (except that it does affect optimizer
// behavior as it reads registers). Simply prints the message
// followed by the values of the 0-2 arguments.
void log(const char *msg, vr a=vr(), vr b=vr());
// The returned code can be invoked through a standard cdecl
// function pointer with the following signature:
//
// void code(const float* consts, const simd_t* ins, simd_t* outs, void **mems, simd_t *msk)
//
// All "simd_t" pointers should be aligned (i.e. 32 bytes for
// AVX). The scalar and memory pointers need alignment only if
// the platform requires it (x86 does not) or for performance
// reasons. Caller is responsible for sizing and allocating the
// output buffer in executable memory.
void codegen();
int code_size() { return native_code.size(); }
void get_code(char* buf) { std::memcpy(buf, &native_code[0], code_size()); }
// The generated code can be configured to iterate over an array
// of records. The count is passed in at call time via the mems[]
// array in the index specified. The input and output base
// pointers are incremented at each iteration by the specified
// strides (in units of a SIMD register, e.g. 32 bytes).
// FIXME: should this just be done via the opts string?
void set_count(int count_idx, int input_stride, int output_stride)
{ countidx = count_idx; instride = input_stride; outstride = output_stride; }
// Quick & dirty interpreter for the VM. Makes debugging the
// optimizer much easier and provides a good reference for test
// suites. Takes scalar (!) pointers: emulates logic, not SIMD.
// Also doesn't support counted iteration.
void interpret(float *consts, float *ins, float *outs, void **mems, FILE *log=0);
void dump_code(FILE* log);
// FIXME: used only by the unit test right now. If we expose
// these, it wants a better API than blindly expecting the user to
// tightly pack. Find the highest index and/or make the user set
// it explicitly?
int ninputs() { return inputs.i2v.size(); }
int noutputs() { return outputs.i2v.size(); }
bool is_immediate(const vr &v) { return is_imm(v.rid); }
private:
friend class vr;
friend class vecgen_avx;
friend class vecgen_opt;
friend vr load(memsz sz, int obj, const vr &i);
friend void store(memsz sz, int obj, const vr &i, const vr &val);
typedef std::list<insn>::iterator iptr;
void emit(op o, int dst=0, int a=0, int b=0, int c=0);
void emit(insn &i);
int load(memsz sz, int obj, int idx);
void store(memsz sz, int obj, int idx, int val);
void set_opts(const char* opts);
// Ensures the vr has a register ID and recursively emits code to
// get it the right value at runtime.
void gen(vr *v);
int rid(const vr &v) { return v.rid; }
int nextid() { return next_id++; }
int gen_immediate(int val);
void insert_before(iptr p, op o, int dst=0, int a=0, int b=0, int c=0);
iptr insert_after(iptr p, op o, int dst=0, int a=0, int b=0, int c=0);
void add_predication();
void add_forward_jumps();
union uif { int i; float f; };
bool interpret_op(op o, uif& dst, uif a, uif b, uif c);
void interpret_cull(uif &msk, iptr ip, std::map<int,uif> ®s, void **mems, FILE *log);
bool is_scratch(int r);
bool is_output(int r) { return outputs.hasid(r); }
bool is_imm(int r) { return immr2i.find(r) != immr2i.end(); }
int imm_val(int r) { return immrecs[immr2i[r]].val; }
static bool is_control(op o) { return o==IF || o==ELSE || o==FI || o==BREAK; }
const char* opt(const char* key);
int opt_i(const char* key);
std::string regname(int r);
static const char* opname(op o);
struct immrec { int rid, val; };
enum flowstate { TOP, IN_IF, IN_ELSE, IN_LOOP };
struct cullmeta { int mi_max, mi_data, mi_cb; };
regmap inputs, outputs, consts;
std::vector<immrec> immrecs;
std::map<int,int> immr2i;
std::deque<flowstate> flow;
int next_id;
int msk; // mask register
vr mskreg;
int num_culls;
std::list<insn> code;
std::vector<unsigned char> native_code;
std::map<std::string,std::string> opts;
std::map<int,cullmeta> cullbounds;
int countidx;
int instride, outstride;
std::vector<std::string> logmsgs;
int nmems;
};
// Helper macros that make block structure look more natural
inline int _startloop(vecgen& vg) { vg.start_loop(); return 1; }
inline int _endloop(vecgen& vg) { vg.end_loop(); return 0; }
inline int _startif(vecgen& vg, const vr &r) { vg.start_if(const_cast<vr&>(r)); return 1; }
inline int _startelse(vecgen& vg) { vg.start_else(); return 1; }
inline int _endif(vecgen& vg) { vg.end_if(); return 0; }
#define VRIF(vg, e) for(int _1st=_startif(vg, e); _1st; _1st=_endif(vg))
#define VRELSE(vg) for(int _1st=_startelse(vg); _1st; _1st=_endif(vg))
#define VRLOOP(vg) for(int _1st=_startloop(vg); _1st; _1st=_endloop(vg))
#endif // _VECGEN_HPP