tokenizer.py (forked from karpathy/llama2.c)

# Taken from llama code and lightly modified
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
import argparse
import os
import struct
from pathlib import Path
from typing import List

import tiktoken
from tiktoken.load import load_tiktoken_bpe

TOKENIZER_MODEL = "tokenizer.model"  # the llama tiktoken tokenizer model


class Tokenizer:
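    # tiktoken pre-split regex. In order, the alternatives match: common
    # English contractions (case-insensitive), runs of letters with one
    # optional leading non-letter character, number chunks of at most three
    # digits, punctuation runs (optional leading space, trailing newlines),
    # newline runs, whitespace that does not precede a non-space character,
    # and any remaining whitespace.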
    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"

    def __init__(self, tokenizer_model=None):
        model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        mergeable_ranks = load_tiktoken_bpe(model_path)  # dict: token bytes -> merge rank
        self.model_path = model_path

        num_base_tokens = len(mergeable_ranks)
        num_reserved_special_tokens = 256
        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",  # end of turn
        ] + [
            f"<|reserved_special_token_{i}|>"
            for i in range(5, num_reserved_special_tokens - 5)
        ]
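        # 10 named tokens above plus 246 reserved placeholders: 256 special
        # tokens in total, mapped to ids directly after the base vocabulary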
        self.special_tokens = {
            token: num_base_tokens + i for i, token in enumerate(special_tokens)
        }
        self.model = tiktoken.Encoding(
            name=Path(model_path).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        self.n_words = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id = self.special_tokens["<|begin_of_text|>"]
        self.eos_id = self.special_tokens["<|end_of_text|>"]
        self.pad_id = -1  # no padding token in this vocabulary
        # token ids that should terminate generation
        self.stop_tokens = {
            self.special_tokens["<|end_of_text|>"],
            self.special_tokens["<|eot_id|>"],
        }

    def encode(
        self, s: str, bos: bool, eos: bool, allowed_special, disallowed_special
    ) -> List[int]:
        assert type(s) is str
        t: List[int] = self.model.encode(
            s,
            allowed_special=allowed_special,
            disallowed_special=disallowed_special,
        )
        if bos:
            t.insert(0, self.bos_id)
        if eos:
            t.append(self.eos_id)
        return t

    def decode(self, t: List[int]) -> str:
        return self.model.decode(t)
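
    # Example round trip (a sketch; assumes a tokenizer.model file is present):
    #   tok = Tokenizer()
    #   ids = tok.encode("hello world", bos=True, eos=False,
    #                    allowed_special=set(), disallowed_special=())
    #   tok.decode(ids[1:])  # -> "hello world" (ids[0] is the BOS id)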

    def export(self):
        # get all the tokens (postprocessed) and their scores as floats
        tokens, scores = [], []
        for i in range(self.n_words):
            # decode the token bytes; tiktoken has no per-token scores, so the
            # token id (its merge rank) is used as a stand-in score
            t = self.model.decode_single_token_bytes(i)
            s = i
            tokens.append(t)
            scores.append(s)
        # record the max token length
        max_token_length = max(len(t) for t in tokens)
        # write the binary file next to the .model file: a uint32
        # max_token_length header, then one record per token of
        # float32 score, uint32 byte length, and the raw token bytes
        tokenizer_bin = self.model_path.replace(".model", ".bin")
        with open(tokenizer_bin, "wb") as f:
            f.write(struct.pack("I", max_token_length))
            for token_bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(token_bytes)))
                f.write(token_bytes)
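

# A minimal reader sketch for the .bin layout written by Tokenizer.export()
# above; read_tokenizer_bin is illustrative and not part of the upstream file.
def read_tokenizer_bin(path: str):
    tokens, scores = [], []
    with open(path, "rb") as f:
        (max_token_length,) = struct.unpack("I", f.read(4))
        while header := f.read(8):  # float32 score + uint32 byte length
            score, length = struct.unpack("fI", header)
            scores.append(score)
            tokens.append(f.read(length))
    return max_token_length, tokens, scores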


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t", "--tokenizer-model", type=str,
        help="optional path to a custom tokenizer model",
    )
    args = parser.parse_args()
    t = Tokenizer(args.tokenizer_model)
    t.export()
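
# Example invocation (assumes a Llama 3 tiktoken tokenizer.model is available):
#   python tokenizer.py                              # uses ./tokenizer.model
#   python tokenizer.py -t /path/to/tokenizer.model
# Either form writes a tokenizer.bin next to the .model file.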