-
Notifications
You must be signed in to change notification settings - Fork 0
/
superstring
executable file
·307 lines (265 loc) · 9.96 KB
/
superstring
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
#!/usr/bin/env python3
"""
Approximate Shortest Superstring Generator
https://github.com/eloj/superstrings/
Copyright © 2024 Eddy Jansson
Licensed under the MIT License. See LICENSE file for details.
"""
from math import log2, ceil, gcd
import sys
import argparse
import random
import ssp
# TODO: bits/bytes helper print function
# TODO: transforms: (test with emoji!)
# TODO: --bps --bits-per-symbol [auto,1-32] (default=8)
# TODO: --to-bitstrings
# TODO: ?? --from-bitstrings (i.e input already bitstrings, adapt bps etc)
# TODO: ?? min/max klen should be in bits?
# TODO: bitmap(s) to denote word starts, endings, iff variable klen.
PROGRAM_VERSION = "1.0.0"
def bits(n):
    """ Return the number of bits needed to encode the number _n_

    The result is the length of the binary representation of _n_, with a
    minimum of one bit so that zero is still representable:
    bits(0) == 1, bits(1) == 1, bits(7) == 3, bits(8) == 4.

    Raises ValueError if _n_ is negative.
    """
    if n < 0:
        raise ValueError("bits() requires a non-negative number")
    # int.bit_length() is exact for arbitrarily large integers and, unlike
    # ceil(log2(n)), gives the right answer for exact powers of two
    # (e.g. 8 == 0b1000 needs four bits, not three).
    return max(1, int(n).bit_length())
#def calculate_bits_per_character(string, char_base='A'):
# a = 0
# for ch in string:
# a |= ord(ch) - ord(char_base)
# # print(a, bin(a))
# return bits(a)
def data_loader(args):
    """ Load data into array

    Reads all lines from args.infile (opened as UTF-8), or from stdin when
    args.infile is empty/None. Lines that start with any of the non-empty
    prefixes in args.comment are dropped; every remaining line is returned
    with surrounding whitespace stripped.

    Notes:
    - The comment check is made against the raw line, before stripping, so
      an indented comment is NOT treated as a comment.
    - Blank lines are kept and come back as empty strings.
    - args.comment may be None or empty, in which case nothing is filtered.
    """
    if args.infile:
        with open(args.infile, 'r', encoding="utf-8") as fp:
            lines = fp.readlines()
    else:
        lines = sys.stdin.readlines()

    # An empty prefix would match every line, so it is ignored. With no
    # prefixes left the tuple is empty and startswith() is always False.
    prefixes = tuple(p for p in (args.comment or ()) if p)

    return [line.strip() for line in lines if not line.startswith(prefixes)]
def apply_move_to_front(arr, args):
    """ Move each item in args.mtf to the front of the array

    The list _arr_ is modified in place. Items are processed in the order
    given, each one being moved to index 0, so the LAST item in args.mtf
    ends up first in _arr_. An item already at the front is left alone.
    Unless args.quiet is set, every actual move is reported on stdout.

    Raises ValueError if an item in args.mtf is not present in _arr_.
    """
    for k in args.mtf:
        try:
            idx = arr.index(k)
        except ValueError:
            # Same exception type as before, but say which element was
            # requested instead of the bare "x is not in list".
            raise ValueError(f"Move-to-front element {k!r} not found in input.") from None
        if idx > 0:
            arr.insert(0, arr.pop(idx))
            if not args.quiet:
                print(f"Moved {arr[0]} at index {idx} to front.")
# Script entry point: parse the command line, load and de-duplicate the input
# strings, build (or merely join) the superstring, then print it together with
# the optional offset and length tables.
if __name__ == "__main__":
# TODO: Add convert_arg_line_to_args= that splits by spaces but handles quoted strings.
argp = argparse.ArgumentParser(
description = "Approximate Shortest Superstring Generator -- https://github.com/eloj/superstrings",
fromfile_prefix_chars='@',
epilog="You can also supply arguments from a file using the '@argsfile' syntax.",
formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=28),
)
# -q and -v share a mutually exclusive group and cannot be combined.
vgroup = argp.add_mutually_exclusive_group()
vgroup.add_argument("-q", "--quiet", help="Least verbose", action="store_true")
vgroup.add_argument("-v", "--verbose", help="Increase output verbosity", action="count", default=0)
# At most one of shuffle / sort / loops / brute may be selected per run.
order_group = argp.add_mutually_exclusive_group()
order_group.add_argument("-s", "--shuffle", help="Shuffle the input", action="store_true", default=False)
order_group.add_argument("-S", "--sort", help="Sort input by entry frequency", action="store_true", default=False)
order_group.add_argument("-L", "--loops", help="Shuffle and regenerate until min-length doesn't improve", action="store", type=int, default=0)
order_group.add_argument("-B", "--brute", help="Use brute-force. Warning: Only for tiny inputs!", action="store_true")
# -C and -F may be repeated; each occurrence contributes one string to a list.
# Both are None when the option is never given.
argp.add_argument("-C", "--comment", metavar="STR", help="String(s) that start a comment in the input", action="extend", nargs=1, type=str)
argp.add_argument("-F", "--mtf", metavar="STR", help="Input element(s) to move-to-front", action="extend", nargs=1, type=str)
argp.add_argument("-j", "--join-only", help="Only join input, don't generate superstring", action="store_true")
# argp.add_argument("--bps", help="Encode as n bits-per-symbol", action="extend", type=int, default=8)
argp.add_argument("-i", "--index-table", help="Always output offset/index table", action="store_true")
argp.add_argument("-l", "--length-table", help="Always output lengths table", action="store_true")
argp.add_argument("-R", "--reduce-lengths", help="Reduce lengths based on minimum entry length/GCD", action="store_true")
argp.add_argument("-G", "--reduce-offsets", help="Reduce offsets based on their GCD (gen. indeces)", action="store_true")
argp.add_argument("-V", "--version", help="Display program version and exit", action="version", version=PROGRAM_VERSION)
# default=sys.stdin, type=argparse.FileType('r')
# Optional positional; when omitted, data_loader() reads from stdin.
argp.add_argument("infile", nargs="?", help="File containing set of strings, one per line")
args = argp.parse_args()
# The default comment prefix is applied here, after parsing, rather than via
# default= on the option (see the linked CPython issue).
# https://github.com/python/cpython/issues/60603
if args.comment is None:
args.comment = ['#']
if args.verbose > 1:
print(f"Comment prefix(es): {args.comment}")
original_input = data_loader(args)
# input_bps is only referenced by the disabled bits-per-symbol detection below.
input_bps = 0
transformed_input = []
# Sentinel start value: min_klen stays at 65536 if the input is empty or if
# every input string is at least that long.
min_klen = 2**16
max_klen = 0
original_input_len = 0
unique_input_len = 0
# Gather min/max/total string length over the original input, i.e. before
# duplicates are removed.
for v in original_input:
klen = len(v)
if klen < min_klen:
min_klen = klen
if klen > max_klen:
max_klen = klen
original_input_len += klen
# The length table is printed when explicitly requested, or together with the
# index table when the entries do not all have the same length.
print_length_table = args.length_table or (args.index_table and (min_klen != max_klen))
# De-duplicate the input. With --sort the distinct strings are ordered by
# descending occurrence count; otherwise first-occurrence order is kept.
if args.sort:
if not args.quiet:
print("Removing duplicates by sorting input by frequency.")
hist = {}
# NOTE(review): the enumerate() index i is unused in this loop.
for i, v in enumerate(original_input):
if v in hist:
hist[v] += 1
else:
hist[v] = 1
unique_input = sorted(hist, key=hist.get, reverse=True)
if args.verbose > 1:
print("Histogram:")
for k in unique_input:
print(f"{k} => {hist[k]}")
else:
if not args.quiet:
print("Removing duplicates from input.")
# Keeps the first occurrence of every string.
# NOTE(review): each element is checked against a fresh slice of everything
# before it, so this is quadratic in the number of input lines.
unique_input = [i for n, i in enumerate(original_input) if i not in original_input[:n]]
if args.shuffle:
if not args.quiet:
print("Shuffling input.")
random.shuffle(unique_input)
# Move-to-front runs after sorting/shuffling, so it has the final say on which
# entries lead the list.
if args.mtf is not None:
if args.verbose > 0:
print(f"Applying move to front on input set {args.mtf}")
apply_move_to_front(unique_input, args)
for v in unique_input:
unique_input_len += len(v)
if args.brute:
if not args.quiet:
if args.verbose > 0:
print("WARNING: Using brute-force is extremely slow. Use only on very small inputs.")
if len(unique_input) >= 30:
print(f"WARNING: This run is unlikely to return a result in reasonable time (N={len(unique_input)}).")
# Calculate fixed bits-per-symbol from input.
## if input_bps == 0:
## input_alphabet = "".join(sorted(set("".join(unique_input))))
## input_bps = calculate_bits_per_character(input_alphabet, input_alphabet[0])
## if not args.quiet:
## print(f"Auto-detected bits-per-symbol of input: {input_bps}")
#
# TODO: transformations: n-bit re-encode + bitstring support
#
# No transformation is implemented yet. This binds transformed_input to the
# very same list object as unique_input, so the in-place shuffles performed
# in --loops mode below reorder unique_input as well.
transformed_input = unique_input
#
# Output stats on input data set
#
if args.verbose:
print(f"{len(original_input)} strings (total len={original_input_len}, min/max klen={min_klen}/{max_klen}) in input, {len(unique_input)} unique strings (total len={unique_input_len}) remain.")
if not args.quiet:
if args.verbose:
print("Final pre-processed input:")
print(transformed_input)
# Build the output string: plain concatenation with --join-only, otherwise a
# superstring from the ssp module (using ssp.brutedp when --brute is given).
output = None
if args.join_only:
output = "".join(transformed_input)
else:
if args.brute:
min_output = ssp.generate_superstring(transformed_input, ssp.brutedp)
else:
min_output = ssp.generate_superstring(transformed_input)
if args.loops:
# Shuffle input and regenerate superstring until minimum length doesn't improve.
# iters counts iterations since the last improvement (it is reset to 0 when a
# shorter result is found); total_iters counts every iteration. The loop ends
# once iters reaches args.loops.
if not args.quiet:
print(f"Optimizing with {args.loops} loops per iteration.")
total_iters = iters = 0
while True:
iters += 1
total_iters += 1
random.shuffle(transformed_input)
s = ssp.generate_superstring(transformed_input)
if len(s) < len(min_output):
if args.verbose:
print(f"New best: {len(s)} at iteration {iters}, restarting.")
min_output = s
iters = 0
# Progress indicator: one dot per 50 iterations, only for longer runs.
if not args.quiet and args.loops > 50 and iters % 50 == 0:
print(".", end="", flush=True)
if iters >= args.loops:
break
if args.verbose:
print(f"Steady state reached after {total_iters} total iterations.")
output = min_output
#
# Calculate and verify offsets
#
# One offset/length pair is recorded per ORIGINAL input line (duplicates
# included), using the first occurrence of the string in the output.
offsets = []
lengths = []
total_index_bits_estimate = 0
for v in original_input:
# NOTE(review): str.index() raises ValueError when v is absent from output,
# so a missing string aborts here rather than in the verification step below.
ofs = output.index(v)
total_index_bits_estimate += bits(ofs)
offsets.append(ofs)
lengths.append(len(v))
# Verification step:
if v != output[ofs:ofs+len(v)]:
print(f"Expected {v} at offset {ofs}, got {output[ofs:ofs+len(v)]}")
sys.exit("ERROR in encoding! You've done goofed up!")
#
# Output alphabet and alphabet size
#
if args.verbose > 0:
output_alphabet = "".join(sorted(set(output)))
print(f"Output alphabet size={len(output_alphabet)}:\n{output_alphabet}")
#
# Output superstring
#
if not args.quiet:
opt_len = len(output)
saved = original_input_len - opt_len
saved_unique = unique_input_len - opt_len
print(f"Generated Superstring is {opt_len} characters, saving {saved} on original, {saved_unique} on unique:")
print(output)
#
# Output index table
#
if args.index_table:
index_factor_str = ""
if args.reduce_offsets:
# Divide all offsets by their common GCD when that GCD is larger than 1.
# Calling math.gcd() with more than two arguments needs Python 3.9+.
factor = gcd(*offsets)
if factor > 1:
if args.verbose:
print(f"Reducing offsets by a factor of {factor}.")
offsets = [a // factor for a in offsets]
index_factor_str = f"/{factor}"
else:
if args.verbose:
print("Offset reduction not possible.")
if not args.quiet:
# NOTE(review): max() raises ValueError on an empty list, i.e. when the
# input contained no strings at all.
max_index = max(offsets)
max_index_bits = bits(max_index)
# Table size in whole bytes if every entry used max_index_bits bits; the
# integer division rounds down.
index_size_max = max_index_bits * len(offsets) // 8
print(f"The {len(offsets)} verified ok offsets{index_factor_str} (~{total_index_bits_estimate}/{index_size_max}*8 bits, min unit={max_index_bits} bits) are:")
print(offsets)
#
# Output length table
#
if print_length_table:
length_factor_str = ""
if args.reduce_lengths:
# Two reductions exist: divide every length by the common GCD, or (in the
# else branch) subtract the minimum entry length from every length.
factor = gcd(*lengths)
# Only divide by the factor if the term size is non-uniform.
if factor > 1 and (min_klen != max_klen):
if args.verbose:
print(f"Reducing lengths by a factor of {factor}.")
lengths = [a // factor for a in lengths]
length_factor_str = f"/{factor}"
else:
if args.verbose:
print(f"Reducing lengths by {min_klen}.")
lengths = [a - min_klen for a in lengths]
length_factor_str = f"-{min_klen}"
if not args.quiet:
max_length_bits = bits(max(lengths))
# Table size in whole bytes, rounded down by the integer division.
index_length_max = max_length_bits * len(lengths) // 8
print(f"The {len(lengths)} lengths{length_factor_str} with min/max={min_klen}/{max_klen}, req. bits/entry={max_length_bits} bits, ({index_length_max} bytes total) are:")
# Hint at --reduce-lengths when storing (length - min_klen) would need fewer
# bits per entry than the lengths as currently printed.
if args.verbose and max_length_bits > bits(max_klen - min_klen):
print("NOTE: Lengths may be optimized for smaller unit size, see --reduce-lengths")
print(lengths)