Commit
Merge branch 'master' of github.com:lab-ml/python_autocomplete
merge
Showing 17 changed files with 2,188 additions and 956 deletions.
(Large diffs for three files are not rendered.)
@@ -1,5 +1,9 @@
 from labml import experiment, lab

 if __name__ == '__main__':
-    experiment.save_bundle(lab.get_path() / 'bundle.tar.gz', '39b03a1e454011ebbaff2b26e3148b3d',
-                           data_files=['cache/itos.json', 'cache/n_tokens.json', 'cache/stoi.json'])
+    experiment.save_bundle(lab.get_path() / 'bundle.tar.gz', 'a6cff3706ec411ebadd9bf753b33bae6',
+                           data_files=['cache/itos.json',
+                                       'cache/n_tokens.json',
+                                       'cache/stoi.json',
+                                       'cache/bpe.json',
+                                       ])
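The change swaps in a new run ID (the long hex strings look like labml run UUIDs) and adds 'cache/bpe.json' to the bundled data files alongside the existing vocabulary caches. As a quick sanity check, the resulting archive can be listed with the standard library. A minimal sketch, assuming 'bundle.tar.gz' is an ordinary gzipped tarball as the name suggests:

import tarfile
from pathlib import Path

# Hypothetical location; the script above writes the bundle under lab.get_path().
bundle_path = Path('bundle.tar.gz')

# List the members to confirm the cache files (itos, stoi, n_tokens, bpe) were packed.
with tarfile.open(bundle_path, 'r:gz') as tar:
    for member in tar.getmembers():
        print(member.name, member.size)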
@@ -0,0 +1,20 @@
import string
from typing import Dict, List, Tuple

ID_CHARS = set(string.ascii_letters + string.digits + '_')


class Tokenizer:
    n_tokens: int
    itos: List[str]
    stoi: Dict[str, int]
    is_trained: int

    def encode(self, data: str, *, is_silent: bool = True):
        raise NotImplementedError

    def train(self, data: str):
        pass

    def rstrip(self, data: str) -> Tuple[str, List[int]]:
        return data, self.encode(data)
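For illustration only (not part of this commit), a minimal character-level subclass shows how the interface fits together: train builds the itos/stoi vocabulary, encode maps text to token ids, and the default rstrip returns the text with its encoding unchanged.

class CharTokenizer(Tokenizer):
    """Hypothetical example subclass; not in the repository."""

    def __init__(self):
        self.itos = []
        self.stoi = {}
        self.n_tokens = 0
        self.is_trained = 0

    def train(self, data: str):
        # The vocabulary is just the sorted set of characters seen during training.
        self.itos = sorted(set(data))
        self.stoi = {c: i for i, c in enumerate(self.itos)}
        self.n_tokens = len(self.itos)
        self.is_trained = 1

    def encode(self, data: str, *, is_silent: bool = True):
        # Map each character to its id; characters outside the vocabulary raise KeyError.
        return [self.stoi[c] for c in data]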
@@ -0,0 +1,91 @@
from typing import List, Tuple

from labml import monit
from python_autocomplete.dataset import ID_CHARS


class WordTokenizer:
    def collect_words(self, data: str):
        raise NotImplementedError

    def get_words(self) -> Tuple[List[str], List[int]]:
        raise NotImplementedError

    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
        raise NotImplementedError


class SourceCodeTokenizer(WordTokenizer):
    def __init__(self):
        self.words = {}

    def add_word(self, word):
        if not word:
            return

        if word not in self.words:
            self.words[word] = 1
        else:
            self.words[word] += 1

    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
        last_idx = 0
        is_id = False
        res = []

        for i, c in monit.enum('Collect words', data, is_silent=is_silent):
            if c in ID_CHARS:
                if not is_id:
                    if last_idx < i:
                        res.append(data[last_idx:i])
                    last_idx = i
                    is_id = True
            else:
                if is_id:
                    if last_idx < i:
                        res.append(data[last_idx:i])
                    last_idx = i
                    is_id = False

        if last_idx < len(data):
            res.append(data[last_idx:])

        return res

    def collect_words(self, data: str):
        last_idx = 0
        is_id = False

        for i, c in monit.enum('Collect words', data):
            if c in ID_CHARS:
                if not is_id:
                    self.add_word(data[last_idx:i])
                    last_idx = i
                    is_id = True
            else:
                if is_id:
                    self.add_word(data[last_idx:i])
                    last_idx = i
                    is_id = False

        self.add_word(data[last_idx:])

    def get_words(self):
        words_list = [(f, w) for w, f in self.words.items()]
        words_list.sort(key=lambda x: -x[0])

        return [w for _, w in words_list], [f for f, _ in words_list]


class NoTokenizer(WordTokenizer):
    def __init__(self):
        self.data = ''

    def collect_words(self, data):
        self.data += data

    def get_words(self):
        return [self.data], [1]

    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
        return [data]
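A small usage sketch (again, not part of the commit; monit.enum from labml enumerates the string while showing a progress display): SourceCodeTokenizer splits source into alternating runs of identifier characters and everything else, and get_words returns the collected words with their frequencies, most frequent first.

tokenizer = SourceCodeTokenizer()
source = "def add(a, b):\n    return a + b\n"

# Alternating runs of ID_CHARS (letters, digits, '_') and other characters:
print(tokenizer.tokenize(source, is_silent=True))
# ['def', ' ', 'add', '(', 'a', ', ', 'b', '):\n    ', 'return', ' ', 'a', ' + ', 'b', '\n']

tokenizer.collect_words(source)
words, counts = tokenizer.get_words()
print(words[:3], counts[:3])  # [' ', 'a', 'b'] [2, 2, 2] -- ties keep first-seen order (sort is stable)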