# quickFileTokenThing.py
"""
Quickly check counts of a manually singled chunk
reads in a file and outputs counts and re-decoded text with tokens marked
"""
import tokensToUTF
from GPT2.encoder import get_encoder
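# tokensToUTF is assumed to be a companion script in this repo, and GPT2.encoder
# a local copy of the BPE encoder module from a GPT-2 codebase (get_encoder()
# loads the vocab); neither is defined in this file.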
###
inTextFile = "testText.txt"
logTokenIDs = False     # toggle console output of the token ID list
logDiscord = True       # toggle Discord code-block delimiters around console output
logCounts = True        # toggle console output of counts
doDecode = True         # toggle output of re-decoded text with token boundaries marked
griffinMarker = False   # add a marker at Griffin's 15-token association range (currently 🐦💀)
dragonMarker = False    # add a marker at Dragon's 60-token association range (currently 🐲💀)
overrideKey = ''        # a decoded token after which to reset the threshold markers, for formats that assume re-mentions
futureMan = False       # thresholds adapted to futureman: ignores a leading << when grabbing the marker; resets the threshold at newlines
sanitizeInput = False   # toggle cleanup of common patterns; NOTE: this will most likely change tokenization!
sanitizeOutput = False  # toggle insertion of (newline) markers in outputs
outputToFile = False    # toggle file output of the re-decoded text
outTextFile = "outText.txt"
###
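# NOTE: tokensToUTF.getFixEncodes() is a local helper; the lookup loop below
# assumes it returns a dict mapping decoded text -> token ID, e.g. (hypothetical
# values) {"\n": 198, "<<": 16791, ...}. If your copy returns the inverse
# mapping, invert it before use.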
fixEncodes = tokensToUTF.getFixEncodes()
with open(inTextFile, "r", encoding="utf-8") as inText:
    text = inText.read()
if sanitizeInput:
    text = text.replace("\n\n", " ")  # NOTE: collapsing blank lines changes tokenization
enc = get_encoder()
context_tokens = enc.encode(text)
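# For reference: with the standard GPT-2 vocabulary, enc.encode("Hello world")
# should come back as [15496, 995] -- one token per word here, though token
# counts usually run higher than word counts for unusual or non-English text.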
if logTokenIDs:
    tokenIDstring = f"\n{context_tokens}"
else:
    tokenIDstring = ""
if logDiscord:
    wraps = "```"
else:
    wraps = ""
if logCounts:
    wordCount = len(text.split())
    charCount = len(text)
    tokenCount = len(context_tokens)
    countsBit = f"\n{wordCount} words, {charCount} characters -> {tokenCount} BPE tokens"
else:
    countsBit = ""
# check what comes back out
tokenWarning = ""
keyMarker = None
if doDecode:
    decoded = ""
    tokenNumber = 0
    if griffinMarker or dragonMarker:
        tokenWarning = " - Any model should be fine with this."
    if futureMan:
        tokenWarning = " - Looks like functioning futureman!"
    caughtKey = False
    if overrideKey:
        keyMarker = overrideKey
        print(f'overrideKey set, using "{keyMarker}" as the cutoff marker')
    for token in context_tokens:
        curToken = None  # stays None if the token has no entry in fixEncodes
        for key, value in fixEncodes.items():
            if value == token:
                decoded += key + "|"
                curToken = key
                if not caughtKey:
                    # grab the first decoded token as the cutoff marker,
                    # unless futureman mode tells us to skip a leading <<
                    if futureMan and key == '<<':
                        print("futureman mode, ignoring <<!")
                    else:
                        if not overrideKey:
                            keyMarker = key
                            print(f'Extracted cutoff marker: "{keyMarker}"')
                        caughtKey = True
                break  # token IDs are assumed unique; stop at the first match
        if keyMarker:
            if curToken == '\n' and futureMan:
                tokenNumber = 0  # futureman resets its threshold at newlines
            elif curToken == keyMarker:
                tokenNumber = 0  # a re-mention of the marker resets the count
            else:
                tokenNumber += 1
            if tokenNumber == 15 and griffinMarker:
                if futureMan:
                    tokenWarning = ' - This is too much for futureman!'
                else:
                    tokenWarning = ' - This might be too much for Griffin!'
                decoded += '🐦💀'
            if tokenNumber == 60 and dragonMarker:
                tokenWarning = ' - This might be too much, even for Dragon!'
                decoded += '🐲💀'
    if sanitizeOutput:
        finalTxt = decoded.replace('\n', '(newline)\n')
    else:
        finalTxt = decoded
    decodedTextBit = f"\nTokenizes to:\n{finalTxt}"
else:
    decodedTextBit = ""
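# Worked example of the reset logic above (hypothetical marker): with
# keyMarker "<<", griffinMarker on, and a chunk like "<< one two ... <<",
# tokenNumber counts up from each decoded "<<" (or, in futureman mode, from
# each newline) and 🐦💀 lands after the 15th token since the last reset.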
print(f"{wraps}\n{text}{decodedTextBit}{tokenIDstring}{countsBit}{tokenWarning}\n{wraps}")
if outputToFile:
outText = open(outTextFile, "w", encoding="utf-8")
outText.write(f"{wraps}\n{text}{decodedTextBit}{tokenIDstring}{countsBit}{tokenWarning}\n{wraps}")
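# Usage sketch (assumes the GPT2 and tokensToUTF modules above are importable
# from this directory): put the chunk to check in testText.txt next to this
# script, flip the toggles above, then run:
#   python quickFileTokenThing.py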