Skip to content
This repository was archived by the owner on Jul 23, 2024. It is now read-only.

Commit 00f3916

Browse files
committed
add const cache linesize detect
1 parent bc4bbd5 commit 00f3916

File tree

4 files changed

+170
-50
lines changed

4 files changed

+170
-50
lines changed

README.md

+29-19
Original file line numberDiff line numberDiff line change
@@ -12,39 +12,49 @@
1212
2. `cmake .. && make`
1313
3. `python ../compile_sass.py -arch=<70|75|80>`
1414

15-
## Benchmark
15+
## Microbenchmark
1616

1717
### 1. Latency
1818

19-
|Device | | RTX-2070 |
20-
|:--------------------------:|:---------:|:---------:|
21-
|Global Latency |cycle | TBD |
22-
|L2 Latency |cycle | 236 |
23-
|L1 Latency |cycle | 32 |
24-
|Shared Latency |cycle | 23 |
25-
|Constant Latency |cycle | 448 |
26-
|Constant L2 Latency |cycle | 62 |
27-
|Constant L1 Latency |cycle | 4 |
19+
|Device | | Turing RTX-2070 |
20+
|:--------------------------:|:---------:|:----------------:|
21+
|Global Latency |cycle | TBD |
22+
|L2 Latency |cycle | 236 |
23+
|L1 Latency |cycle | 32 |
24+
|Shared Latency |cycle | 23 |
25+
|Constant Latency |cycle | 448 |
26+
|Constant L2 Latency |cycle | 62 |
27+
|Constant L1 Latency |cycle | 4 |
2828

2929
- const L1-cache is as fast as register.
3030

3131
### 2. Cache Linesize
3232

3333

34-
|Device | | RTX-2070 |
35-
|:--------------------------:|:---------:|:---------:|
36-
|L2 Linesise |bytes | TBD |
37-
|L1 Linesize |bytes | 32 |
38-
|Constant L2 Linesise |bytes | TBD |
39-
|Constant L1 Linesize |bytes | TBD |
34+
|Device | | Turing RTX-2070 |
35+
|:--------------------------:|:---------:|:----------------:|
36+
|L2 Linesise |bytes | 64 |
37+
|L1 Linesize |bytes | 32 |
38+
|Constant L2 Linesise |bytes | 256 |
39+
|Constant L1 Linesize |bytes | 32 |
4040

4141

4242

4343
### 3. Reg Bankconflict
4444

45-
| Instruction | conflict | without conflict |
46-
|:-----------:|:--------:|:----------------:|
47-
|FFMA | 1.484 | 1.758 |
45+
| Instruction | | conflict | without conflict |
46+
|:-----------:|:-------:|:--------:|:----------------:|
47+
|FFMA | CPI | 1.758 | 1.484 |
48+
49+
50+
### 4. Shared Bankconflict
51+
52+
| Memory Load | | Turing RTX-2070 |
53+
|:----------------------:|:---------:|:----------------:|
54+
| Single | cycle | 23 |
55+
| Vector2 X 2 | cycle | 27 |
56+
| Conflict Strided | cycle | 41 |
57+
| Conlict-Free Strided | cycle | 32 |
4858

4959

5060

memory/cache_linesize.cu

+52-20
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,39 @@
77
#include "utils.cuh"
88

99

10-
__global__ void linesizeDetectKernel(float* input, float* output, uint* clock){
10+
__constant__ float cinput[1024];
1111

12-
uint c[48];
12+
__global__ void linesizeDetectKernel(float* input, float* output, uint* clock, float* cinput){
13+
14+
uint c[256];
1315
float val = 0;
14-
float acc = 0;
1516

17+
float acc = 0;
1618
c[0] = getClock();
19+
#pragma unroll
20+
for (int i = 0; i < 256; ++i){
21+
asm volatile(
22+
"ld.global.cg.b32 %0, [%1]; \n\t"
23+
:"=f"(val):"l"(input):"memory"
24+
);
25+
c[i+1] = getClock();
26+
acc += val;
27+
input += 2;
28+
}
29+
#pragma unroll
30+
for (int i = 0; i < 256; ++i){
31+
clock[i] = c[i+1] - c[i];
32+
}
33+
output[0] = acc;
34+
35+
/////////////////////////////////////////////////////////////////////////
1736

37+
input += 1024;
38+
clock += 512;
39+
acc = 0;
40+
c[0] = getClock();
1841
#pragma unroll
19-
for (int i = 0; i < 32; ++i){
42+
for (int i = 0; i < 256; ++i){
2043
asm volatile(
2144
"ld.global.ca.f32 %0, [%1]; \n\t"
2245
:"=f"(val):"l"(input):"memory"
@@ -25,27 +48,34 @@ __global__ void linesizeDetectKernel(float* input, float* output, uint* clock){
2548
acc += val;
2649
input++;
2750
}
28-
2951
#pragma unroll
30-
for (int i = 0; i < 32; ++i){
52+
for (int i = 0; i < 256; ++i){
3153
clock[i] = c[i+1] - c[i];
3254
}
33-
output[0] = acc;
55+
output[1] = acc;
3456
}
3557

3658

37-
int detectCacheLinesize(uint* clock, int size){
38-
int l1_linesize = 0;
59+
int detectCacheLinesize(uint* clock, int size, uint gap){
60+
int linesize = 0;
3961
uint last_cycle = clock[0];
62+
63+
int first = 0;
64+
int second = 0;
65+
66+
// formatArray(clock, 256, 16);
4067
for (int i = 1; i < size; ++i){
41-
// printf("clock %d latency %u\n", i, clock[i]);
42-
if (clock[i] > last_cycle and clock[i] - last_cycle > 10) {
43-
l1_linesize = i * 4;
44-
break;
68+
if (clock[i] > last_cycle and clock[i] - last_cycle > gap) {
69+
if (first == 0){
70+
first = i;
71+
} else {
72+
second = i;
73+
break;
74+
}
4575
}
4676
last_cycle = clock[i];
4777
}
48-
return l1_linesize;
78+
return (second - first) * 4;
4979
}
5080

5181

@@ -72,25 +102,27 @@ int main(){
72102
dim3 gDim(1, 1, 1);
73103
dim3 bDim(1, 1, 1);
74104

75-
void* kernel_args[] = {&input_d, &output_d, &clock_d};
105+
void* kernel_args[] = {&input_d, &output_d, &clock_d, &cinput};
76106
const char* cubin_name = "../sass_cubin/cache_linesize.cubin";
77107
const char* kernel_name = "cacheLinesize";
78108

79109
launchSassKernel(cubin_name, kernel_name, gDim, bDim, 0, kernel_args);
80110
cudaMemcpy(clock_h, clock_d, sizeof(float) * size, cudaMemcpyDeviceToHost);
81111
cudaDeviceSynchronize();
82112
printf(">>> SASS Level Cache Linesize Result\n");
83-
// printf(" L2 Linesize \t= %3d B\n", clock_h[1]);
84-
printf(" L1 LineSize \t= %3u B\n", detectCacheLinesize(clock_h, size));
113+
printf(" Global L2 LineSize \t= %3u B\n", detectCacheLinesize(clock_h, 512, 40));
114+
printf(" Global L1 LineSize \t= %3u B\n", detectCacheLinesize(clock_h + 512, 512, 10));
115+
printf(" Constant L2 LineSize \t= %3u B\n", detectCacheLinesize(clock_h + 1024, 512, 100));
116+
printf(" Constant L1 LineSize \t= %3u B\n", detectCacheLinesize(clock_h + 1536, 512, 10));
85117

86118

87119

88-
linesizeDetectKernel<<<gDim, bDim>>>(input_d, output_d, clock_d);
120+
linesizeDetectKernel<<<gDim, bDim>>>(input_d, output_d, clock_d, cinput);
89121
cudaMemcpy(clock_h, clock_d, sizeof(float) * size, cudaMemcpyDeviceToHost);
90122
cudaDeviceSynchronize();
91123
printf("\n");
92124
printf(">>> CUDA-C Level Cache Linesize Result\n");
93-
// printf(" L2 Linesize \t= %3d B\n", clock_h[1]);
94-
printf(" L1 LineSize \t= %3u B\n", detectCacheLinesize(clock_h, size));
125+
printf(" Global L2 LineSize \t= %3u B\n", detectCacheLinesize(clock_h, 512, 40));
126+
printf(" Global L1 LineSize \t= %3u B\n", detectCacheLinesize(clock_h + 512, 512, 10));
95127
return 0;
96128
}

sass_cubin/cache_linesize.sass

+88-10
Original file line numberDiff line numberDiff line change
@@ -11,40 +11,118 @@ clock, 8
1111
3: output_hi
1212
4: clock_lo
1313
5: clock_hi
14-
6-48 ~ e<0-35>
15-
49: vA
16-
50-96 ~ c<0-35>
14+
6: vA
15+
7: e
16+
8-240 ~ c<0-200>
1717
</regs>
1818

19+
20+
<consts>
21+
const_a, 1024
22+
</consts>
23+
24+
25+
1926
--:-:-:-:2 MOV input_lo, input[0];
2027
--:-:-:-:2 MOV input_hi, input[1];
2128
--:-:-:-:2 MOV output_lo, output[0];
2229
--:-:-:-:2 MOV output_hi, output[1];
2330
--:-:-:-:2 MOV clock_lo, clock[0];
2431
--:-:-:-:4 MOV clock_hi, clock[1];
2532

26-
--:-:-:-:2 CS2R c0, SR_CLOCKLO;
2733

34+
--:-:-:-:2 CS2R c0, SR_CLOCKLO;
2835
<CODE>
2936
SASS_CODE = []
37+
loop_size = 200
38+
39+
LDG = "--:-:0:-:2 LDG.E.STRONG.GPU vA, [input_lo+{:}];"
40+
CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;"
41+
IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;"
42+
STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;"
43+
44+
for i in range(loop_size):
45+
SASS_CODE += [LDG.format(hex(i * 4))]
46+
SASS_CODE += [CS2R.format(i+1)]
47+
48+
for i in range(loop_size):
49+
SASS_CODE += [IADD.format(i+1, i)]
50+
SASS_CODE += [STG.format(hex(i*4), i)]
51+
52+
out_ = "\n" + "\n".join(SASS_CODE) + "\n"
53+
</CODE>
54+
3055

31-
loop_size = 35
56+
57+
58+
59+
60+
--:-:-:-:2 IADD3 clock_lo, clock_lo, 0x800, RZ;
61+
--:-:-:-:2 CS2R c0, SR_CLOCKLO;
62+
63+
<CODE>
64+
SASS_CODE = []
65+
loop_size = 200
3266

3367
LDG = "--:-:0:-:2 LDG.E.STRONG.CTA vA, [input_lo+{:}];"
3468
CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;"
35-
IADD = "--:-:-:-:5 IADD3 e{:}, c{:}, -c{:}, RZ;"
36-
STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e{:};"
37-
BAR = "--:-:-:-:2 BAR.SYNC 0x0;"
69+
IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;"
70+
STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;"
3871

3972
for i in range(loop_size):
4073
SASS_CODE += [LDG.format(hex(i * 4))]
4174
SASS_CODE += [CS2R.format(i+1)]
4275

43-
SASS_CODE += [BAR]
4476
for i in range(loop_size):
45-
SASS_CODE += [IADD.format(i, i+1, i)]
77+
SASS_CODE += [IADD.format(i+1, i)]
78+
SASS_CODE += [STG.format(hex(i*4), i)]
79+
80+
out_ = "\n" + "\n".join(SASS_CODE) + "\n"
81+
</CODE>
82+
83+
84+
85+
86+
--:-:-:-:2 IADD3 clock_lo, clock_lo, 0x800, RZ;
87+
--:-:-:-:2 CS2R c0, SR_CLOCKLO;
88+
<CODE>
89+
SASS_CODE = []
90+
loop_size = 200
91+
92+
LDC = "--:-:0:-:2 LDC.E vA, const_a[{:}];"
93+
CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;"
94+
IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;"
95+
STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;"
96+
97+
for i in range(loop_size):
98+
SASS_CODE += [LDC.format(i)]
99+
SASS_CODE += [CS2R.format(i+1)]
100+
101+
for i in range(loop_size):
102+
SASS_CODE += [IADD.format(i+1, i)]
103+
SASS_CODE += [STG.format(hex(i*4), i)]
104+
105+
out_ = "\n" + "\n".join(SASS_CODE) + "\n"
106+
</CODE>
107+
108+
109+
--:-:-:-:2 IADD3 clock_lo, clock_lo, 0x800, RZ;
110+
--:-:-:-:2 CS2R c0, SR_CLOCKLO;
111+
<CODE>
112+
SASS_CODE = []
113+
loop_size = 200
114+
115+
LDC = "--:-:0:-:2 MOV vA, const_a[{:}];"
116+
CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;"
117+
IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;"
118+
STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;"
119+
120+
for i in range(loop_size):
121+
SASS_CODE += [LDC.format(i)]
122+
SASS_CODE += [CS2R.format(i+1)]
46123

47124
for i in range(loop_size):
125+
SASS_CODE += [IADD.format(i+1, i)]
48126
SASS_CODE += [STG.format(hex(i*4), i)]
49127

50128
out_ = "\n" + "\n".join(SASS_CODE) + "\n"

sass_cubin/warp_schedule.sass

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ out_ = "\n" + "\n".join(SASS_CODE) + "\n"
6060

6161
--:-:-:-:4 BAR.SYNC 0x0;
6262

63-
--:-:-:-:5 IADD3 e1, c2, -c1, RZ;
63+
--:-:-:-:5 IADD3 e1, c2, -c1, RZ;
6464

6565
--:-:0:-:5 IMAD.WIDE clock_offset_lo, laneid, 0x4, clock_lo;
6666
--:-:-:-:4 MOV clock_offset_hi, clock_hi;

0 commit comments

Comments
 (0)