-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathcounting_sort.h
241 lines (210 loc) · 8.73 KB
/
counting_sort.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2010-2016 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#pragma once
#include <stdio.h>
#include <math.h>
#include "utilities.h"
#include "sequence_ops.h"
#include "transpose.h"
#include <chrono>
#include <thread>
// TODO
// Make sure works for inplace or not with regards to move_uninitialized
namespace pbbs {
// the following parameters can be tuned
constexpr const size_t SEQ_THRESHOLD = 8192;
constexpr const size_t BUCKET_FACTOR = 32;
constexpr const size_t LOW_BUCKET_FACTOR = 16;
// count number in each bucket
template <typename s_size_t, typename InSeq, typename KeySeq>
void seq_count_(InSeq In, KeySeq Keys,
s_size_t* counts, size_t num_buckets) {
size_t n = In.size();
// use local counts to avoid false sharing
size_t local_counts[num_buckets];
for (size_t i = 0; i < num_buckets; i++)
local_counts[i] = 0;
for (size_t j = 0; j < n; j++) {
size_t k = Keys[j];
if (k >= num_buckets)
throw std::runtime_error("bucket out of range in count_sort");
local_counts[k]++;
}
for (size_t i = 0; i < num_buckets; i++)
counts[i] = local_counts[i];
}
// write to destination, where offsets give start of each bucket
template <typename s_size_t, typename InSeq, typename KeySeq>
void seq_write_(InSeq In, typename InSeq::value_type* Out, KeySeq Keys,
s_size_t* offsets, size_t num_buckets) {
// copy to local offsets to avoid false sharing
size_t local_offsets[num_buckets];
for (size_t i = 0; i < num_buckets; i++)
local_offsets[i] = offsets[i];
for (size_t j = 0; j < In.size(); j++) {
size_t k = local_offsets[Keys[j]]++;
move_uninitialized(Out[k], In[j]);
}
}
// write to destination, where offsets give end of each bucket
template <typename s_size_t, typename InSeq, typename KeySeq>
void seq_write_down_(InSeq In, typename InSeq::value_type* Out, KeySeq Keys,
s_size_t* offsets, size_t) { // num_buckets) {
for (long j = In.size()-1; j >= 0; j--) {
long k = --offsets[Keys[j]];
move_uninitialized(Out[k], In[j]);
}
}
// Sequential counting sort internal
template <typename s_size_t, typename InS, typename OutS, typename KeyS>
void seq_count_sort_(InS In, OutS Out, KeyS Keys,
s_size_t* counts, size_t num_buckets) {
// count size of each bucket
seq_count_(In, Keys, counts, num_buckets);
// generate offsets for buckets
size_t s = 0;
for (size_t i = 0; i < num_buckets; i++) {
s += counts[i];
counts[i] = s;
}
// send to destination
seq_write_down_(In, Out.begin(), Keys, counts, num_buckets);
}
// Sequential counting sort
template <typename InS, typename OutS, typename KeyS>
sequence<size_t> seq_count_sort(InS& In, OutS& Out, KeyS& Keys,
size_t num_buckets) {
sequence<size_t> counts(num_buckets+1);
seq_count_sort_(In.slice(), Out.slice(), Keys.slice(),
counts.begin(), num_buckets);
counts[num_buckets] = In.size();
return counts;
}
// Parallel internal counting sort specialized to type for bucket counts
// returns counts, and a flag
// If skip_if_in_one and returned flag is true, then the Input was alread
// sorted, and it has not been moved to the output.
template <typename s_size_t,
typename InS, typename OutS, typename KeyS>
std::pair<sequence<size_t>, bool>
count_sort_(InS& In, OutS& Out, KeyS& Keys,
size_t num_buckets,
float parallelism=1.0,
bool skip_if_in_one = false) {
timer t("count sort", false);
using T = typename InS::value_type;
size_t n = In.size();
size_t num_threads = num_workers();
bool is_nested = parallelism < .5;
// pick number of blocks for sufficient parallelism but to make sure
// cost on counts is not to high (i.e. bucket upper).
size_t par_lower = 1 + round(num_threads * parallelism * 9);
size_t size_lower = 1; // + n * sizeof(T) / 2000000;
size_t bucket_upper = 1 + n * sizeof(T) / (4 * num_buckets * sizeof(s_size_t));
size_t num_blocks = std::min(bucket_upper, std::max(par_lower, size_lower));
// if insufficient parallelism, sort sequentially
if (n < SEQ_THRESHOLD || num_blocks == 1 || num_threads == 1) {
return std::make_pair(seq_count_sort(In,Out,Keys,num_buckets),false);}
size_t block_size = ((n-1)/num_blocks) + 1;
size_t m = num_blocks * num_buckets;
sequence<s_size_t> counts(m);
t.next("head");
// sort each block
parallel_for(0, num_blocks, [&] (size_t i) {
s_size_t start = std::min(i * block_size, n);
s_size_t end = std::min(start + block_size, n);
seq_count_(In.slice(start,end), Keys.slice(start,end),
counts.begin() + i*num_buckets, num_buckets);
}, 1, is_nested);
t.next("count");
sequence<size_t> bucket_offsets = sequence<size_t>::no_init(num_buckets+1);
parallel_for(0, num_buckets, [&] (size_t i) {
size_t v = 0;
for (size_t j= 0; j < num_blocks; j++)
v += counts[j*num_buckets + i];
bucket_offsets[i] = v;
}, 1 + 1024/num_blocks);
bucket_offsets[num_buckets] = 0;
// if all in one bucket, then no need to sort
size_t num_non_zero = 0;
for (size_t i =0 ; i < num_buckets; i++)
num_non_zero += (bucket_offsets[i] > 0);
size_t total = scan_inplace(bucket_offsets.slice(), addm<size_t>());
if (skip_if_in_one && num_non_zero == 1) {
return std::make_pair(std::move(bucket_offsets), true);}
if (total != n)
throw std::logic_error("in count_sort, internal bad count");
auto dest_offsets = sequence<s_size_t>::no_init(num_blocks*num_buckets);
parallel_for(0, num_buckets, [&] (size_t i) {
size_t v = bucket_offsets[i];
size_t start = i * num_blocks;
for (size_t j= 0; j < num_blocks; j++) {
dest_offsets[start+j] = v;
v += counts[j*num_buckets + i];
}
}, 1 + 1024/num_blocks);
sequence<s_size_t> counts2(m);
parallel_for(0, num_blocks, [&] (size_t i) {
size_t start = i * num_buckets;
for (size_t j= 0; j < num_buckets; j++)
counts2[start+j] = dest_offsets[j*num_blocks + i];
}, 1 + 1024/num_buckets);
t.next("buckets");
parallel_for(0, num_blocks, [&] (size_t i) {
s_size_t start = std::min(i * block_size, n);
s_size_t end = std::min(start + block_size, n);
seq_write_(In.slice(start,end), Out.begin(),
Keys.slice(start,end),
counts2.begin() + i*num_buckets, num_buckets);
}, 1, is_nested);
t.next("transpose");
return std::make_pair(std::move(bucket_offsets), false);
}
// If skip_if_in_one and returned flag is true, then the Input was alread
// sorted, and it has not been moved to the output.
template <typename InS, typename KeyS>
std::pair<sequence<size_t>, bool>
count_sort(InS const &In,
range<typename InS::value_type*> Out,
KeyS const &Keys,
size_t num_buckets,
float parallelism = 1.0,
bool skip_if_in_one=false) {
size_t n = In.size();
if (n != Out.size() || n != Keys.size())
throw std::invalid_argument("lengths don't match in call to count_sort");
size_t max32 = ((size_t) 1) << 32;
if (n < max32 && num_buckets < max32)
// use 4-byte counters when larger ones not needed
return count_sort_<uint32_t>(In, Out, Keys, num_buckets,
parallelism, skip_if_in_one);
return count_sort_<size_t>(In, Out, Keys, num_buckets,
parallelism, skip_if_in_one);
}
template <typename InS, typename KeyS>
std::pair<sequence<typename InS::value_type>, sequence<size_t>>
count_sort(InS const &In, KeyS const &Keys, size_t num_buckets) {
sequence<typename InS::value_type> Out(In.size());
auto a = count_sort(In, Out.slice(), Keys, num_buckets);
return std::make_pair(std::move(Out), std::move(a.first));
}
}