// cutil.cu
// Copyright Douglas Goddard 2016
// Licensed under the MIT license
#include "csolv.h"

#include <cstdio>
#include <vector>

#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
// Unpacks 5-byte groups into 6-byte groups (this is specific to N=200, K=9).
//
// Thread t (flat global index) reads the 5 bytes at d_in + 5*t and writes
// 6 bytes at d_out + 6*t:
//   out[0..2] = the first 20 bits of the group, right-aligned in 3 bytes
//   out[3..5] = the last 20 bits of the group, right-aligned in 3 bytes
//
// There is no bounds check in the kernel: the launch must contain exactly as
// many threads as there are 5-byte groups in d_in / 6-byte slots in d_out.
__global__ void expand_array(
unsigned char *d_in,
unsigned char *d_out)
{
    const uint32_t group = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned char *src = d_in + group*5*sizeof(unsigned char);
    unsigned char *dst = d_out + group*6*sizeof(unsigned char);

    // first half: src[0], src[1] and the high nibble of src[2]
    dst[0] = static_cast<unsigned char>(src[0] >> 4);
    dst[1] = static_cast<unsigned char>((src[0] << 4) | (src[1] >> 4));
    dst[2] = static_cast<unsigned char>((src[1] << 4) | (src[2] >> 4));

    // second half: the low nibble of src[2], then src[3] and src[4] unchanged
    dst[3] = static_cast<unsigned char>(src[2] & 0x0f);
    dst[4] = src[3];
    dst[5] = src[4];
}
// Builds the (key, index) pairs that are sorted for round `r`.
//
// One thread per row. For row `i` (flat global thread index, i < num_keys):
//   d_sort_indices[i] = i
//   d_sort_keys[i]    = the 3 bytes at d_hashes + 30*i + 3*r, combined
//                       most-significant byte first into a 24-bit value
// Rows are addressed with a fixed 30-byte stride. Threads with
// i >= num_keys do nothing, so the grid may be rounded up to whole blocks.
__global__ void get_sort_keys(
unsigned char* d_hashes,
uint32_t *d_sort_keys,
uint32_t *d_sort_indices,
uint32_t r,
uint32_t num_keys)
{
    const uint32_t row = threadIdx.x + blockIdx.x * blockDim.x;
    if(row >= num_keys)
        return;

    d_sort_indices[row] = row;

    const unsigned char *chunk = d_hashes + row*30*sizeof(unsigned char) + 3*r;
    const uint32_t key = (chunk[0] << 16) | (chunk[1] << 8) | chunk[2];
    d_sort_keys[row] = key;
}
// Produces, for round `r`, the row indices 0..num_keys-1 ordered by each
// row's 3-byte sort key (see get_sort_keys).
//
// Parameters:
//   d_hashes            device pointer to the hash rows read by get_sort_keys
//   d_sort_indices_out  device buffer with room for num_keys uint32_t values;
//                       receives the sorted row indices
//   r                   round number, selects which 3-byte chunk is the key
//   num_keys            number of rows to sort
//
// The index buffer is filled and sorted in place, wrapped in a
// thrust::device_ptr. Thrust treats raw pointers as host iterators, so a raw
// device pointer must not be handed to a Thrust algorithm or to a
// device_vector range constructor directly.
//
// On a launch failure the error is reported on stderr and the contents of
// d_sort_indices_out are unspecified.
void cuda_sort_step(
unsigned char* d_hashes,
uint32_t *d_sort_indices_out,
uint32_t r,
uint32_t num_keys)
{
    // nothing to sort; also avoids an invalid zero-block launch
    if(num_keys == 0)
        return;

    // keys live only for the duration of this call; the vector frees them
    // even if Thrust throws
    thrust::device_vector<uint32_t> dv_sort_keys(num_keys);
    uint32_t *d_sort_keys = thrust::raw_pointer_cast(dv_sort_keys.data());

    // round the grid up so the last partial block is covered; the kernel
    // bounds-checks against num_keys
    uint32_t block_count = num_keys / BLOCK_SIZE;
    if(num_keys % BLOCK_SIZE != 0)
        block_count += 1;

    get_sort_keys<<<block_count, BLOCK_SIZE>>>(d_hashes, d_sort_keys, d_sort_indices_out, r, num_keys);
    cudaError_t err = cudaGetLastError();
    if(err != cudaSuccess) {
        fprintf(stderr, "cuda_sort_step: get_sort_keys launch failed: %s\n", cudaGetErrorString(err));
        return;
    }

    // sort the indices by key directly in the caller's buffer
    thrust::device_ptr<uint32_t> dp_indices(d_sort_indices_out);
    thrust::sort_by_key(dv_sort_keys.begin(), dv_sort_keys.end(), dp_indices);
}
// Writes 1 into each of the first `size` elements of `reduction`.
//
// One element per thread, addressed by the flat global thread index.
// Threads whose index is >= size return without writing, so the grid may be
// rounded up to whole blocks.
__global__ void initialize_to_one(
uint32_t *reduction,
uint32_t size)
{
    const uint32_t slot = threadIdx.x + blockIdx.x * blockDim.x;
    if(slot >= size)
        return;

    reduction[slot] = 1;
}
// Walks the (i, j) pair tables from level `r` down to level 0 and appends
// the level-0 pairs it reaches to `indices`.
//
// At level r > 0 the entry (i_bufs[r][index], j_bufs[r][index]) names two
// entries of level r-1; the i-branch is followed first, then the j-branch.
// At level 0 the pair itself is appended (i first, then j).
//
// Parameters:
//   indices  output vector, appended to
//   i_bufs   host-readable tables, i_bufs[level][entry]
//   j_bufs   host-readable tables, j_bufs[level][entry]
//   r        level of `index`
//   index    entry within level r to expand
//
// Each visited entry is traced to stdout.
void recover_indices_recursive(
std::vector<uint32_t> *indices,
uint32_t **i_bufs,
uint32_t **j_bufs,
uint32_t r,
uint32_t index)
{
    const uint32_t left = i_bufs[r][index];
    const uint32_t right = j_bufs[r][index];

    if(r != 0) {
        // interior level: descend into both referenced entries
        printf("index:\t%x\n", index);
        printf("i, j:\t%x, %x\n", left, right);
        recover_indices_recursive(indices, i_bufs, j_bufs, r-1, left);
        recover_indices_recursive(indices, i_bufs, j_bufs, r-1, right);
        return;
    }

    // leaf level: the pair holds the values to collect
    printf("ADDING:\t%x, %x\n", left, right);
    indices->push_back(left);
    indices->push_back(right);
}
// Copies packed hashes to the device and expands them with expand_array.
//
// Parameters:
//   h_in      host buffer of packed data, processed in 5-byte groups
//   in_size   size of h_in in bytes; trailing bytes that do not form a full
//             5-byte group are ignored
//   d_out     address of a device pointer to an already-allocated output
//             buffer; the pointer itself is not modified
//   out_size  size of *d_out in bytes; each input group produces 6 bytes
//
// Every group is expanded, including the last `groups % BLOCK_SIZE` ones
// that a rounded-down block count would skip. Because expand_array has no
// bounds check, the remainder is handled by a second, single-block launch
// on offset pointers rather than by rounding the grid up.
//
// Errors are reported on stderr; on error the contents of *d_out are
// unspecified.
void cuda_expand_array(
unsigned char* h_in,
uint32_t in_size,
unsigned char **d_out,
uint32_t out_size)
{
    uint32_t thread_count = in_size / 5;

    // never write past the end of the caller's output buffer
    if(thread_count > out_size / 6) {
        fprintf(stderr, "cuda_expand_array: output holds %u groups but input has %u; truncating\n",
                out_size / 6, thread_count);
        thread_count = out_size / 6;
    }
    if(thread_count == 0)
        return;

    const size_t used_bytes = (size_t)thread_count * 5;

    unsigned char *d_in = NULL;
    cudaError_t err = cudaMalloc(&d_in, used_bytes);
    if(err != cudaSuccess) {
        fprintf(stderr, "cuda_expand_array: cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return;
    }

    err = cudaMemcpy(d_in, h_in, used_bytes, cudaMemcpyHostToDevice);
    if(err != cudaSuccess) {
        fprintf(stderr, "cuda_expand_array: host-to-device copy failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_in);
        return;
    }

    const uint32_t full_blocks = thread_count / BLOCK_SIZE;
    const uint32_t tail_threads = thread_count % BLOCK_SIZE;

    if(full_blocks > 0)
        expand_array<<<full_blocks, BLOCK_SIZE>>>(d_in, *d_out);

    if(tail_threads > 0) {
        // groups already covered by the full blocks
        const size_t done = (size_t)full_blocks * BLOCK_SIZE;
        expand_array<<<1, tail_threads>>>(d_in + done*5, *d_out + done*6);
    }

    err = cudaGetLastError();
    if(err != cudaSuccess)
        fprintf(stderr, "cuda_expand_array: expand_array launch failed: %s\n", cudaGetErrorString(err));

    // cudaFree may also report errors from the preceding asynchronous launches
    err = cudaFree(d_in);
    if(err != cudaSuccess)
        fprintf(stderr, "cuda_expand_array: cudaFree reported: %s\n", cudaGetErrorString(err));
}
// Runs the GPU solve pipeline over a buffer of packed hashes.
//
// Steps:
//   1. allocate a device buffer of out_size bytes and expand h_in into it
//      (cuda_expand_array)
//   2. copy the expanded hashes back into h_out
//   3. for r in [0, CUDA_K): sort the current rows (cuda_sort_step), then run
//      cuda_xor_step for every round but the last, and
//      cuda_final_collision_step for the last
//
// Parameters:
//   h_in      host buffer of packed hashes
//   in_size   size of h_in in bytes
//   h_out     host buffer of at least out_size bytes; receives the expanded
//             hashes
//   out_size  size in bytes of the expanded hash buffer
//
// CUDA errors detected here are reported on stderr and end the solve early;
// device memory allocated by this function is released on every path.
void cuda_solve_hashes(
unsigned char* h_in,
uint32_t in_size,
unsigned char *h_out,
uint32_t out_size)
{
    unsigned char *d_out = NULL;
    cudaError_t err = cudaMalloc(&d_out, out_size);
    if(err != cudaSuccess) {
        fprintf(stderr, "cuda_solve_hashes: cudaMalloc(d_out) failed: %s\n", cudaGetErrorString(err));
        return;
    }

    cuda_expand_array(h_in, in_size, &d_out, out_size);

    // copy out hashes so we can reuse d_out in loop
    err = cudaMemcpy(h_out, d_out, out_size, cudaMemcpyDeviceToHost);
    if(err != cudaSuccess) {
        fprintf(stderr, "cuda_solve_hashes: copying expanded hashes failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_out);
        return;
    }

    // track sizes; owned by the vector so the storage is released on return
    std::vector<uint32_t> buf_sizes(CUDA_K+1, 0);
    // NOTE(review): the first round always sorts 2^21 rows regardless of
    // out_size -- confirm callers always provide at least that many rows.
    buf_sizes[0] = (1<<21);

    // NOTE(review): presumably set up by cuda_xor_step /
    // cuda_final_collision_step; it is never released here -- confirm who
    // owns it.
    uint32_t *d_ij_buf = NULL;
    uint32_t *d_sort_indices = NULL;

    // for(int r=0; r<3; r++) {
    for(int r=0; r<CUDA_K; r++) {
        // realloc sort_indices to size
        err = cudaMalloc(&d_sort_indices, sizeof(uint32_t)*buf_sizes[r]);
        if(err != cudaSuccess) {
            fprintf(stderr, "cuda_solve_hashes: cudaMalloc(d_sort_indices) failed in round %d: %s\n",
                    r, cudaGetErrorString(err));
            break;
        }

        cuda_sort_step(d_out, d_sort_indices, r, buf_sizes[r]);

        if(r < (CUDA_K-1))
            cuda_xor_step(&d_out, d_sort_indices, &d_ij_buf, r, &buf_sizes[0]);
        else
            cuda_final_collision_step(&d_out, d_sort_indices, &d_ij_buf, r, &buf_sizes[0]);

        cudaFree(d_sort_indices);
        d_sort_indices = NULL;
    }

    // Debugging aids (disabled):
    // std::vector<uint32_t> indices;
    // recover_indices_recursive(&indices, i_bufs, j_bufs, 3, 0x1bf94);
    // for(int i=0; i<indices.size(); i++) {
    // printf("%x\n", indices[i]);
    // }
    // uint32_t *h_sort_indices = (uint32_t *)malloc(sizeof(uint32_t)*(1<<21));
    // cudaMemcpy(h_sort_indices, d_sort_indices, sizeof(uint32_t)*(1<<21), cudaMemcpyDeviceToHost);
    // printf("HI: %x\n", h_sort_indices[0]);
    // printf("HI: %x\n", h_sort_indices[1]);
    // printf("HI: %x\n", h_sort_indices[2]);
    // for(int i=0; i<25; i++) {
    // unsigned char *hash = h_out+h_sort_indices[i]*30;
    // for(int j=0; j<30; j++)
    // printf("%02x ", hash[j]);
    // printf("\n");
    // }

    cudaFree(d_out);
}