-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathdata-conversion.py
72 lines (58 loc) · 2.73 KB
/
data-conversion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import json
import base64
import struct
# Read tokenizer.json (the file that was distributed with the LLaMA model) into a dict.
with open(r"tokenizer.json", 'r', encoding='utf-8') as tokenizer_file:
    d = json.load(tokenizer_file)

# The vocabulary is the list of token strings, in the order they appear in the file.
vocab = list(d['model']['vocab'])

# Join the vocabulary into a single line-break-delimited string, encode it as UTF-8,
# base64 encode the result, and save it to a file.
vocab_text = '\n'.join(vocab)
vocab_base64 = base64.b64encode(vocab_text.encode("utf-8"))
with open('vocab_base64.txt', 'wb') as vocab_file:
    vocab_file.write(vocab_base64)
# The merge rules as a list of strings; an entry's position in the list is its merge priority.
# Example entry: "gr a" (meaning that "gr" and "a" merge into "gra").
merges = list(d['model']['merges'])

# Helper map from token string to its position in vocab.
# The positions have no special meaning in the tokenizer; they are only used
# to help compress the merge data.
vocab_map = {token: position for position, token in enumerate(vocab)}
print(len(vocab_map))

# Represent each merge with 2 integers (e.g. "merge the 5th and the 11th token in vocab"),
# stored flat: left and right position of the first merge, then of the second, and so on.
integers = []
for merge in merges:
    left, right = merge.split(" ")
    integers.extend((vocab_map[left], vocab_map[right]))
# Compress the merge data into a binary format.
# Since the vocabulary has fewer than 2^17 entries, each integer is written as a
# fixed-width 17-bit field (most significant bit first). The first 2*17 bits define
# the first merge, the next 2*17 bits define the second merge, and so on.
# First, construct a list of 17-character strings where each character is '0' or '1'.
bit_strings = []
for integer in integers:
    if not 0 <= integer < 2 ** 17:
        # A value outside this range would not produce exactly 17 characters and
        # would silently misalign every field written after it.
        raise ValueError(f"vocabulary index {integer} does not fit in 17 bits")
    bit_strings.append(format(integer, '017b'))

# The combined bit string must have a length divisible by 8 to be converted to bytes,
# so append zero bits up to the next byte boundary. The amount needed is the distance
# to that boundary, not the remainder itself: with a remainder of 2 we need 6 zeros.
# Padding with the remainder left a short final chunk whose bits int(..., 2) would
# right-align, corrupting the last merge.
count_bits = 17 * len(bit_strings)
count_needed_padding = (8 - count_bits % 8) % 8
if count_needed_padding != 0:
    bit_strings.append('0' * count_needed_padding)

# String of zeros and ones, representing all the merges (plus trailing padding).
combined_bit_string = ''.join(bit_strings)

# Partition the string into 8-bit chunks and convert each chunk to an int in 0..255.
int_chunks = [int(combined_bit_string[i:i+8], 2) for i in range(0, len(combined_bit_string), 8)]

# Convert the integer chunks into bytes.
byte_chunks = struct.pack(f'{len(int_chunks)}B', *int_chunks)

# Save the packed bytes, base64 encoded, to a file.
with open('merges_binary.bin', 'wb') as file:
    file.write(base64.b64encode(byte_chunks))