-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathconfig.hh
321 lines (281 loc) · 12.6 KB
/
config.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
// Include guard for the simulator configuration header.
// NOTE(review): identifiers that begin with an underscore followed by an
// uppercase letter are reserved for the implementation in C++; consider
// renaming this guard (e.g. CONFIG_HH) once all users of the macro are checked.
#ifndef _CONFIG_H
#define _CONFIG_H
#include "asic_core.hh"
#include "common.hh"
// Forward declarations of simulator classes. Within this header the config
// class only refers to `asic` (as a friend and through a pointer member); the
// other names are not used by the class definition itself.
// NOTE(review): presumably these exist to tolerate circular includes between
// the simulator headers -- confirm before removing any of them.
class asic_core;
class asic;
class network;
class stats;
class config;
// Simulator configuration.
//
// A flat collection of public option fields, each carrying an in-class default.
// The constructor and the predicate member functions are defined out of line.
//
// Every field has a deterministic default: enum-typed options without an
// explicit enumerator default are value-initialized (to the value 0), and the
// back-pointer to the owning asic starts as nullptr. Code that assigns these
// fields later (for example the constructor) simply overwrites the defaults.
class config
{
  friend class asic;

public:
  // Builds a configuration associated with `host`.
  // NOTE(review): the definition is not in this header; presumably `host` is
  // stored in `_asic` -- confirm in the implementation file.
  config(asic *host);

  // Execution-variant predicates (defined out of line).
  bool is_async();
  bool is_sync();
  bool is_vector();
  bool is_slice();
  bool is_non_frontier();
  bool no_swarm();

  // ---- dynamic graphs ----
  enum dyn_algo_type _dyn_algo{};
  enum dyn_gen_type _dyn_gen{};
  bool _update_batch = false;
  bool _dyn_graph = false;
  bool _seq_edge = false;
  bool _inc_dyn = false;
  bool _recomp_dyn = false;

  // ---- hybrid execution ----
  int _slice_hrst = 0;
  int _heuristic = 0;
  bool _cache_no_switch = false;
  bool _async_no_switch = false;
  bool _hybrid_eval = false;
  bool _hybrid = false; /* not sure */

  // ---- stats ----
  bool _detailed_stats = false;
  bool _reuse_stats = false;
  bool _profiling = false;

  // ---- spatial partitioning ----
  // These options pick a spatial partitioning policy (only one of them should
  // be set at a time).
  enum spatial_part_type _spatial_part{};
  bool _blocked_dfs = false;
  bool _dfsmap = false;
  bool _bfs_map = false;
  bool _modulo = false;
  bool _linear = false;
  bool _bdfs = false;
  bool _bbfs = false;
  bool _random_spatial = false;
  bool _noload = false;
  int _bd = 1000;

  // ---- crossbar ----
  // These are crossbar features -- they do not apply to PolyGraph's final design.
  int _crossbar_lat = 0;
  int _crossbar_bw = 1;
  bool _model_net_delay = false;
  int _network = 1;
  bool _hrc_xbar = false;

  // ---- network optimizations ----
  // 1. decomposable: multiple scalar networks
  // 2. worst_net: single scalar network
  // 3. ideal_net: this is deprecated
  enum net_traffic_type _net_traffic{};
  bool _decomposable = false;
  bool _worst_net = false;
  bool _ideal_net = false;

  // ---- multicast ----
  // 1. real_multicast: the traditional algorithm, where the packet is
  //    replicated when different destinations prefer different directions to
  //    minimize hops.
  // 2. path_multicast: proposed in GraphVine, where the packet is not
  //    replicated and traverses to minimize hops only for the destination at
  //    the top.
  bool _real_multicast = false;
  bool _path_multicast = false;

  // ---- ideal experiments ----
  // With perfect_net, all packets injected into the network reach their
  // destination without any bandwidth or latency constraints.
  // With perfect_lb, a new update packet is sent to the next available bank
  // irrespective of data mapping.
  bool _perfect_net = false;
  bool _perfect_lb = false;

  // ---- computation datapath ----
  // Used for vector workloads where the latency to perform computation may be
  // large. This latency is used to resolve bank conflicts (conflicting
  // requests are delayed by 8 cycles).
  int _dfg_length = 8;
  // Vector width of the "process" computation.
  int _process_thr = 1;
  // Selects the pull/push variant in graph processing.
  bool _pull_mode = false;

  // ---- memory controller ----
  int _pred_thr = 0;
  // If false, main memory is simulated atomically: a response is received
  // subject to certain latency and bandwidth constraints.
  // If true, DRAMSim2 is used to simulate main memory.
  bool _dramsim_enabled = false;
  // Models request and response traffic from the requesting core to the
  // corresponding memory bank. It becomes significant for large-scale systems.
  bool _numa_contention = false;
  // If set, synchronizes on the returned edge data: waits for all edge data to
  // arrive so that it can be batched for multicast.
  // If reset, each edge is requested as a separate data element.
  bool _all_edge_access = false;
  bool _cache_hit_aware_sched = false;
  bool _mult_cache_hit_aware_sched = false;
  bool _hats = false;
  bool _hybrid_hats_cache_hit = false;
  enum domain_type _domain = graphs;
  bool _hash_join = false;
  bool _tree_trav = false;

  // ---- cache ----
  enum cache_repl_type _cache_repl{};
  // If set, each core gets a private cache; otherwise there is a shared cache
  // across all cores.
  bool _banked = false;
  // This is deprecated.
  bool _all_cache = false;
  // Set when a scratchpad can be configured as a cache; it implements dynamic
  // switching in the slice scheduling variant.
  bool _resizing = false;
  // Selects whether to use the LRU or the MRU replacement policy.
  bool _lru = false;
  // If unset, only vertex properties are cached.
  // If set, edge data is cached as well.
  bool _edge_cache = false;
  bool _special_cache = false; // TODO: this should be combined, separate modules?!!
  // If set, the simulation uses cache mode throughout the algorithm.
  bool _working_cache = false;
  // If set, caches are not updated in case of a miss.
  // This is deprecated -- may not be a correct implementation.
  bool _phi = false;
  // These model the hypothetical cases where all accesses hit or miss in the cache.
  bool _allhits = false;
  bool _allmiss = false;
  // Number of temporal slices of a graph.
  int _slice_count = 4;
  // Applies only to graph-synchronous and slice-synchronous variants: the
  // number of times a slice should be executed before switching to the next
  // slice. It is the parameter for the locality slice scheduling variant.
  int _slice_iter = 1;
  // Random spatial partitioning scheme: the worst case without any preprocessing.
  bool _random = false; // TODO: not sure if this works!!
  // Specifies whether to use metis partitioning results.
  bool _metis = false;

  // ---- task controller ----
  // An old optimization that implements a kind of "local" work-stealing: if
  // the task queue is empty, it can steal data from the overflow buffer of any
  // of the nearby cores. Using nearby cores helps to not lose too much
  // locality while maintaining load balance.
  bool _dyn_load_bal = false;
  // Load balance on the source-vertex side: edge data returned from memory is
  // uniformly distributed across cores irrespective of which source core the
  // request originated from.
  bool _edge_load_bal = false;
  // If set, limits the parallelization of the computation datapath. This
  // includes task queue throughput and the vector width of the prefetch and
  // process units.
  bool _prac = false;
  // An old optimization where we implemented propagation blocking: vector
  // updates are pushed into the aggregation buffer at the rate of
  // 'vector_width_updates/cycle', and the buffer is drained at the rate of
  // task_queue_enqueue_throughput. This parameter specifies whether the
  // aggregation buffer should be popped in FIFO fashion or in priority order.
  bool _presorter = false;
  // This is deprecated.
  bool _reorder = false; // FIXME: not sure!!

  // ---- scheduling policies ----
  // Implements the work-efficiency optimized slice scheduling algorithm variant.
  bool _abcd = false;
  // This is deprecated!
  bool _distance_sched = true;
  // If set, implements the creation-order vertex scheduling algorithm variant.
  // If unset, implements the work-efficiency optimized vertex scheduling
  // algorithm variant.
  bool _fifo = false;

  // ---- vertex coalescing policies ----
  // Coalesces the new active vertex if it has already been activated and not
  // yet pushed into the hardware task queue (see coalescing in the PolyGraph
  // paper).
  bool _entry_abort = false;
  // Deprecated, but the idea was similar to the GraphPulse paper, where new
  // updates to the same vertex may be performed separately without a
  // scratchpad access. The implications are similar to vertex coalescing,
  // except that instead of dropping, "reduce" computations like min need to be
  // performed.
  bool _update_coalesce = false;
  // Deprecated; it was used earlier when task queues used the source vertex
  // property from the original property. In the new design, updated vertex
  // properties are read when pushing tasks from the overflow queue to the
  // hardware task queue (TODO (@vidushi): confirm!).
  bool _abort = false;
  bool _central_batch = true;
  // Number of reserved entries in the task queue for high-priority tasks.
  int _high_prio_reserve = 0;

  // taxonomy: this way will help us cover all workloads -- Do we want to
  // change taxonomy somehow?
  enum update_visibility _update_visible{};
  enum task_sched_type _task_sched{};
  enum slice_sched_type _slice_sched{}; // abcd, round-robin, reuse

  // ---- graph-specific details ----
  // Used to pick an algorithm variant automatically.
  int _graph_dia = 1;
  // Used when preprocessing the graph (in preprocess.cpp).
  bool _undirected = false;
  bool _unweighted = false;
  bool _csr = false;

  // ---- algorithm-specific details ----
  // Should be true for all dense frontier algorithms.
  bool _pr = false;
  // If true, BFS is executed; if false, SSSP is executed.
  // Please note that BFS and PR must not both be true, otherwise they will
  // lead to undefined behavior.
  bool _abfs = false;
  // Connected components.
  bool _acc = false;
  // Deprecated, but it is a special shortest path where the source vertices
  // also have a weight.
  bool _astar = false;
  // Should be true for all vector graph algorithms.
  bool _cf = false;
  // For GCN, both this and CF should be true.
  // For CF, only CF should be true.
  bool _gcn = false;
  // Triangle counting.
  bool _tc = false;
  // Can be true only with GCN.
  // Ladies GCN is a graph sampling strategy that prunes the graph to the
  // assigned number of nodes. The resulting graph is much more dense compared
  // to the original graph, as this algorithm keeps high-degree vertices around.
  bool _ladies_gcn = false;
  // Whether to simulate the matrix-multiplication phase of GCN.
  // Please note that this simulator is based on the graph's vertex processing
  // template; therefore, it analytically adds a fixed number of cycles to
  // model matrix multiplication.
  bool _gcn_matrix = false;
  // Deprecated, but it was defined to simulate synchronous algorithms where
  // new vertices should be activated during a synchronization phase.
  bool _sync_sim = false;

  // ---- algorithm parameters ----
  // Source vertex for the single-source shortest path algorithm.
  int _src_loc = 0;
  // Number of layers in the GCN model.
  int _gcn_layers = 0;
  // Feature length in GCN or CF.
  int _feat_len = 8;
  // Deprecated.
  // Implemented for vector algorithms when "reduce" is commutative and order
  // across the elements of the vector is not desired; otherwise a reorder
  // buffer is required to reorder data from the network.
  bool _sgu_gcn_reorder = false; /* FIXME: do we need this? or is it working? */
  int _batched_cores = 1;

  // ---- gcn ----
  enum sync_type _agg_mult_type = global;
  enum sync_type _mult_agg_type = global;
  bool _heter_cores = false;
  bool _graphmat_cf = false;

  // Deprecated.
  bool _gcn_swarm = false;
  bool _cf_swarm = false;
  int _lazy_cycles = 1000000000;
  int _dep_check_depth = 4;
  bool _chronos = false;

  // ---- algorithm optimizations ----
  bool _sync_scr = false; // TODO: something like depth-wise GCN
  bool _dyn_reuse = false;
  bool _extreme = false; // FIXME: required?
  // Deprecated?
  int _reuse = 1;

  // ---- execution model ----
  enum exec_model_type _exec_model{};
  // Asynchronous with the no-slice algorithm variant.
  bool _sgu = false;
  // Asynchronous with the slicing algorithm variant.
  bool _sgu_slicing = false;
  // Graph-synchronous or slice-synchronous algorithm variant with round-robin
  // slicing.
  bool _graphmat_slicing = false;
  // Graph-synchronous with the no-slice algorithm variant.
  bool _graphmat = false;
  // Slice-synchronous with the locality slice algorithm variant.
  bool _blocked_async = false;
  // These are deprecated.
  bool _swarm = false;
  bool _espresso = false;
  bool _graphlab = false;
  bool _sgu_hybrid = false;
  bool _tesseract = false;
  bool _preprocess = false;

  // ---- simulation details ----
  enum net_type _net{};
  enum algo_type _algo{};
  bool _prio_xbar = false;        // models a case with priority ordering in bank queues
  int _core_cnt = 16;             // total number of cores
  int _num_rows = 4;              // number of rows in the mesh
  int _num_banks = 32;            // number of scratchpad banks
  int _l2size = 4096;             // l2 cache size
  int _task_queue_size = 512;     // local task queue size
  int _bus_width = 16;            // network width is 16 bytes
  int _num_tq_per_core = 1;       // maintain 1 core with throughput of 1 enqueue/dequeue per 2 cycles
  bool _anal_mode = false;        // analysis mode, skip before simulation
  int _lane_width = 1;            // number of independent lanes in a core, deprecated
  int _mem_bw = 256;              // maximum memory bandwidth, used for atomic simulation of memory
  int _scr_bw = 256;              // deprecated, number of banks indicate scratchpad bandwidth
  bool _prefer_tq_latency = false;
  bool _inter_task_reorder = false;
  enum work_dist_type _work_dist{};
  bool _work_stealing = false;    // TODO: not working...!!

  // Non-owning pointer to the asic this configuration belongs to; nullptr
  // until it is assigned.
  // NOTE(review): ownership is assumed from usage as a plain back-pointer --
  // confirm that config never deletes it.
  asic *_asic = nullptr;
};
#endif