
Commit

Merge branch 'master' of github.com:lab-ml/python_autocomplete
merge
vpj committed Feb 16, 2021
2 parents 03915d7 + 6ab0b1d commit 043e4c7
Showing 17 changed files with 2,188 additions and 956 deletions.
533 changes: 47 additions & 486 deletions notebooks/evaluate.ipynb

Large diffs are not rendered by default.

944 changes: 944 additions & 0 deletions notebooks/evaluate_old.ipynb

Large diffs are not rendered by default.

189 changes: 189 additions & 0 deletions notebooks/highlight.ipynb

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions python_autocomplete/bundle.py
@@ -1,5 +1,9 @@
from labml import experiment, lab

if __name__ == '__main__':
-    experiment.save_bundle(lab.get_path() / 'bundle.tar.gz', '39b03a1e454011ebbaff2b26e3148b3d',
-                           data_files=['cache/itos.json', 'cache/n_tokens.json', 'cache/stoi.json'])
+    experiment.save_bundle(lab.get_path() / 'bundle.tar.gz', 'a6cff3706ec411ebadd9bf753b33bae6',
+                           data_files=['cache/itos.json',
+                                       'cache/n_tokens.json',
+                                       'cache/stoi.json',
+                                       'cache/bpe.json',
+                                       ])
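
Note: the bundle now points at a new run (a6cff3706ec411ebadd9bf753b33bae6) and additionally ships cache/bpe.json with the BPE merge table, alongside the existing token maps. The sketch below is not part of the commit; it shows how such cache entries are assumed to be produced with labml's cache_set, which this commit calls in python_autocomplete/dataset/bpe.py. The assumption is that cache_set(name, value) serializes value to cache/<name>.json under the lab path, which is what data_files refers to.

from labml.utils.cache import cache_set

# Illustrative values only; the real ones come from the trained tokenizer.
itos = ['a', 'b', 'ab']                        # token id -> string
stoi = {s: i for i, s in enumerate(itos)}      # string -> token id

cache_set('itos', itos)            # assumed to end up as cache/itos.json
cache_set('stoi', stoi)            # assumed to end up as cache/stoi.json
cache_set('n_tokens', len(itos))   # assumed to end up as cache/n_tokens.json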
20 changes: 20 additions & 0 deletions python_autocomplete/dataset/__init__.py
@@ -0,0 +1,20 @@
import string
from typing import Dict, List, Tuple

ID_CHARS = set(string.ascii_letters + string.digits + '_')


class Tokenizer:
    n_tokens: int
    itos: List[str]
    stoi: Dict[str, int]
    is_trained: int

    def encode(self, data: str, *, is_silent: bool = True):
        raise NotImplementedError

    def train(self, data: str):
        pass

    def rstrip(self, data: str) -> Tuple[str, List[int]]:
        return data, self.encode(data)
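
The new python_autocomplete/dataset package defines the Tokenizer interface (n_tokens, itos, stoi, encode, train, rstrip) that the BPE tokenizer below implements. A hypothetical minimal implementation, for illustration only (not part of the commit), could look like this:

from python_autocomplete.dataset import Tokenizer


class CharTokenizer(Tokenizer):
    # Character-level tokenizer; the class name and behaviour are illustrative.
    def __init__(self, vocab: str):
        self.itos = list(vocab)                              # id -> character
        self.stoi = {c: i for i, c in enumerate(self.itos)}  # character -> id
        self.n_tokens = len(self.itos)
        self.is_trained = 1

    def encode(self, data: str, *, is_silent: bool = True):
        # Drop characters outside the vocabulary, mirroring how BPEEnDe.encode
        # filters on char_stoi membership.
        return [self.stoi[c] for c in data if c in self.stoi]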
205 changes: 122 additions & 83 deletions python_autocomplete/bpe.py → python_autocomplete/dataset/bpe.py
@@ -1,116 +1,146 @@
-import string
+from functools import lru_cache
from heapq import heappush, heappop
-from typing import List, Tuple
+from typing import List

from labml import lab, monit
+from labml.utils.cache import cache_set
+from python_autocomplete.dataset import Tokenizer
+from python_autocomplete.dataset.break_words import SourceCodeTokenizer

-ID_CHARS = set(string.ascii_letters + string.digits + '_')

+class BPE(Tokenizer):
+    def __init__(self, bpe_en_de: 'BPEEnDe', word_tokenizer):
+        self.bpe = bpe_en_de
+        self.word_tokenizer = word_tokenizer
+        self.is_trained = True

-class BPE:
-    def __init__(self):
-        self.char_itos = []
-        self.char_stoi = {}
-        self.bpe_itos = []
-        self.bpe = []
-        self.common = {}
+    @property
+    def n_tokens(self):
+        return len(self.bpe.bpe)

-        self.bpe_itos = self.calc_bpe_itos()
+    @property
+    def itos(self):
+        return self.bpe.bpe_itos

-    def to_char_stoi(self, w: str):
-        return [self.char_stoi[c] for c in w]
+    @property
+    def stoi(self):
+        return self.bpe.bpe_stoi

-    def calc_bpe_itos(self):
-        itos = list(self.char_itos)
-        itos += [itos[p1] + itos[p2] for p1, p2 in self.bpe[len(self.char_itos):]]
-        return itos
+    def encode(self, data: str, *, is_silent: bool = True):
+        words = self.word_tokenizer.tokenize(data, is_silent=is_silent)

+        res = []
+        for w in monit.iterate('Encode words', words, is_silent=is_silent):
+            res += self.bpe.encode(w)

-class Tokenizer:
-    def collect_words(self, data: str):
-        raise NotImplementedError
+        return res

-    def get_words(self) -> Tuple[List[str], List[int]]:
-        raise NotImplementedError
+    def __call__(self, data: str):
+        encoded = self.encode(data)
+        return [self.itos[c] for c in encoded]

-    def tokenize(self, data: str) -> List[str]:
-        raise NotImplementedError
+    def rstrip(self, data: str):
+        words = self.word_tokenizer.tokenize(data, is_silent=True)
+        words = words[:-1]
+        res = []
+        for w in words:
+            res += self.bpe.encode(w)

+        return ''.join(words), res

-class SourceCodeTokenizer(Tokenizer):
-    def __init__(self):
-        self.words = {}

-    def add_word(self, word):
-        if not word:
-            return
+class _BPEEncoder:
+    def __init__(self, pairs):
+        self.pairs = pairs
+        self.codes = []
+        self.next_idx = []
+        self.prev_idx = []
+        self.heap = []

-        if word not in self.words:
-            self.words[word] = 1
-        else:
-            self.words[word] += 1
+    def encode(self, codes: List[int]):
+        self.codes = codes
+        self.next_idx = BPELearner.default_next_pointers(len(codes))
+        self.prev_idx = BPELearner.default_prev_pointers(len(codes))
+        self.heap = []

-    def tokenize(self, data: str) -> List[str]:
-        last_idx = 0
-        is_id = False
-        res = []
+        for i in range(len(self.codes) - 1):
+            self.add_pair((self.codes[i], self.codes[i + 1]), i)

-        for i, c in monit.enum('Collect words', data):
-            if c in ID_CHARS:
-                if not is_id:
-                    if last_idx < i:
-                        res.append(data[last_idx:i])
-                    last_idx = i
-                    is_id = True
-            else:
-                if is_id:
-                    if last_idx < i:
-                        res.append(data[last_idx:i])
-                    last_idx = i
-                    is_id = False

-        if last_idx < len(data):
-            res.append(data[last_idx:])
+        while self.heap:
+            _, idx, pair = heappop(self.heap)
+            self.merge(idx, pair)

-        return res
+        return [c for c in self.codes if c != -1]

-    def collect_words(self, data: str):
-        last_idx = 0
-        is_id = False
+    def merge(self, p2, pair):
+        p3 = self.next_idx[p2]

+        if p3 == -1 or pair[0] != self.codes[p2] or pair[1] != self.codes[p3]:
+            return

-        for i, c in monit.enum('Collect words', data):
-            if c in ID_CHARS:
-                if not is_id:
-                    self.add_word(data[last_idx:i])
-                    last_idx = i
-                    is_id = True
-            else:
-                if is_id:
-                    self.add_word(data[last_idx:i])
-                    last_idx = i
-                    is_id = False
+        self.codes[p2] = self.pairs[pair]
+        self.codes[p3] = -1
+        p1 = self.prev_idx[p2]
+        p4 = self.next_idx[p3]

-        self.add_word(data[last_idx:])
+        if p1 != -1:
+            self.add_pair((self.codes[p1], self.codes[p2]), p1)
+        self.next_idx[p2] = p4
+        if p4 != -1:
+            self.prev_idx[p4] = p2
+            self.add_pair((self.codes[p2], self.codes[p4]), p2)

-    def get_words(self):
-        words_list = [(f, w) for w, f in self.words.items()]
-        words_list.sort(key=lambda x: -x[0])
+    def add_pair(self, pair, idx):
+        if pair not in self.pairs:
+            return

-        return [w for _, w in words_list], [f for f, _ in words_list]
+        heappush(self.heap, (self.pairs[pair], idx, pair))


-class NoTokenizer(Tokenizer):
+class BPEEnDe:
    def __init__(self):
-        self.data = ''
+        self.char_itos = []
+        self.char_stoi = {}
+        self.bpe = []
+        self.popular_words = {}

+        self.bpe_itos = []
+        self.bpe_stoi = {}
+        self.pairs = {}
+        self.encoder = None

+    def load(self, char_itos, char_stoi, bpe):
+        self.char_itos = char_itos
+        self.char_stoi = char_stoi
+        self.bpe = bpe

+        self.calc()

+    def set_popular_words(self, popular_words):
+        self.popular_words = popular_words

+    def calc(self):
+        self.bpe_itos = self.calc_bpe_itos()
+        self.bpe_stoi = {s: i for i, s in enumerate(self.bpe_itos)}
+        self.pairs = {(p[0], p[1]): c for c, p in enumerate(self.bpe) if not isinstance(p, int)}

-    def collect_words(self, data):
-        self.data += data
+        self.encoder = _BPEEncoder(self.pairs)

-    def get_words(self):
-        return [self.data], [1]
+    def to_char_stoi(self, w: str):
+        return [self.char_stoi[c] for c in w]

-    def tokenize(self, data: str) -> List[str]:
-        return [data]
+    def calc_bpe_itos(self):
+        itos = list(self.char_itos)
+        for p1, p2 in self.bpe[len(self.char_itos):]:
+            itos.append(itos[p1] + itos[p2])
+        return itos

+    @lru_cache(1024)
+    def encode(self, word: str):
+        if word in self.popular_words:
+            return self.popular_words[word]

+        return self.encoder.encode([self.char_stoi[c] for c in word if c in self.char_stoi])


class BPELearner:
@@ -284,7 +314,7 @@ def main():
    path = lab.get_data_path() / 'train.py'

    with open(str(path), 'r') as f:
-        data = f.read()[:100_000]
+        data = f.read()

    tokenizer = SourceCodeTokenizer()
    tokenizer.collect_words(data)
@@ -295,6 +325,15 @@ def main():
    print(bpe.bpe_itos()[len(bpe.char_itos):])
    print(len(data), bpe.get_length())

+    cache_set('bpe', {
+        'char_itos': bpe.char_itos,
+        'char_stoi': bpe.char_stoi,
+        'bpe': bpe.bpe
+    })

+    bpe_en_de = BPEEnDe()
+    bpe_en_de.load(bpe.char_itos, bpe.char_stoi, bpe.bpe)


if __name__ == '__main__':
    main()
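
Taken together: BPEEnDe holds the character vocabulary and the learned merge list, _BPEEncoder applies those merges via a heap keyed on merge rank, and BPE wraps both behind the dataset.Tokenizer interface, splitting the input with a word tokenizer first. Below is a hedged, self-contained sketch (not part of the commit) that wires them up with a tiny hand-made merge table instead of a trained BPELearner; the list format (character ids first, then (left, right) merge pairs) is inferred from calc_bpe_itos and calc above, and the expected outputs assume BPELearner.default_next_pointers/default_prev_pointers build the usual forward/backward index chains.

from python_autocomplete.dataset.bpe import BPE, BPEEnDe
from python_autocomplete.dataset.break_words import SourceCodeTokenizer

char_itos = ['d', 'e', 'f']                           # character vocabulary, ids 0..2
char_stoi = {c: i for i, c in enumerate(char_itos)}
bpe = [0, 1, 2, (0, 1)]                               # single chars, then one merge: ('d', 'e') -> 'de'

bpe_en_de = BPEEnDe()
bpe_en_de.load(char_itos, char_stoi, bpe)             # builds bpe_itos, bpe_stoi, pairs, encoder

tokenizer = BPE(bpe_en_de, SourceCodeTokenizer())
print(tokenizer.encode('def'))   # token ids, expected [3, 2]
print(tokenizer('def'))          # subword strings, expected ['de', 'f']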
91 changes: 91 additions & 0 deletions python_autocomplete/dataset/break_words.py
@@ -0,0 +1,91 @@
from typing import List, Tuple

from labml import monit
from python_autocomplete.dataset import ID_CHARS


class WordTokenizer:
    def collect_words(self, data: str):
        raise NotImplementedError

    def get_words(self) -> Tuple[List[str], List[int]]:
        raise NotImplementedError

    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
        raise NotImplementedError


class SourceCodeTokenizer(WordTokenizer):
    def __init__(self):
        self.words = {}

    def add_word(self, word):
        if not word:
            return

        if word not in self.words:
            self.words[word] = 1
        else:
            self.words[word] += 1

    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
        last_idx = 0
        is_id = False
        res = []

        for i, c in monit.enum('Collect words', data, is_silent=is_silent):
            if c in ID_CHARS:
                if not is_id:
                    if last_idx < i:
                        res.append(data[last_idx:i])
                    last_idx = i
                    is_id = True
            else:
                if is_id:
                    if last_idx < i:
                        res.append(data[last_idx:i])
                    last_idx = i
                    is_id = False

        if last_idx < len(data):
            res.append(data[last_idx:])

        return res

    def collect_words(self, data: str):
        last_idx = 0
        is_id = False

        for i, c in monit.enum('Collect words', data):
            if c in ID_CHARS:
                if not is_id:
                    self.add_word(data[last_idx:i])
                    last_idx = i
                    is_id = True
            else:
                if is_id:
                    self.add_word(data[last_idx:i])
                    last_idx = i
                    is_id = False

        self.add_word(data[last_idx:])

    def get_words(self):
        words_list = [(f, w) for w, f in self.words.items()]
        words_list.sort(key=lambda x: -x[0])

        return [w for _, w in words_list], [f for f, _ in words_list]


class NoTokenizer(WordTokenizer):
    def __init__(self):
        self.data = ''

    def collect_words(self, data):
        self.data += data

    def get_words(self):
        return [self.data], [1]

    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
        return [data]
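
The word tokenizer splits source text into alternating runs of identifier characters (ID_CHARS) and everything else; collect_words/get_words additionally build a frequency-sorted list of those runs for the BPE learner. A small usage sketch (not part of the commit; the printed values follow from the logic above):

from python_autocomplete.dataset.break_words import SourceCodeTokenizer

t = SourceCodeTokenizer()
print(t.tokenize('x = foo(1) + bar_2', is_silent=True))
# -> ['x', ' = ', 'foo', '(', '1', ') + ', 'bar_2']

t.collect_words('x = foo(1) + foo(2)')
words, counts = t.get_words()
# words are ordered by descending frequency, so 'foo' (seen twice) comes before 'x'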
