From e52166cfc423daa5814b6a55a3313f1272f0375e Mon Sep 17 00:00:00 2001
From: Allen Goodman
Date: Wed, 22 May 2024 10:22:48 -0400
Subject: [PATCH] cleanup

---
 .../_trainable_protein_mlm_tokenizer.py | 81 -------------------
 1 file changed, 81 deletions(-)
 delete mode 100644 src/beignet/tokenizers/_trainable_protein_mlm_tokenizer.py

diff --git a/src/beignet/tokenizers/_trainable_protein_mlm_tokenizer.py b/src/beignet/tokenizers/_trainable_protein_mlm_tokenizer.py
deleted file mode 100644
index eafc11431a..0000000000
--- a/src/beignet/tokenizers/_trainable_protein_mlm_tokenizer.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import itertools
-import os
-
-import tokenizers
-from datasets import load_dataset
-
-from beignet.tokenizers import ProteinMLMTokenizer
-
-
-class TrainableProteinMLMTokenizer(ProteinMLMTokenizer):
-    def __init__(self, **kwargs):
-        self._tokenizer, self._trainer = self._build_tokenizer(**kwargs)
-        os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
-    def _build_tokenizer(self, **kwargs):
-        pad_token = kwargs.get("pad_token", "<pad>")
-        unk_token = kwargs.get("unk_token", "<unk>")
-        max_vocab_size = kwargs.get("max_vocab_size", 1280)
-
-        tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE(unk_token=unk_token))
-        tokenizer.normalizer = tokenizers.normalizers.NFKC()
-        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel()
-        trainer = tokenizers.trainers.BpeTrainer(
-            vocab_size=max_vocab_size,
-            initial_alphabet=[
-                "A",
-                "R",
-                "N",
-                "D",
-                "C",
-                "E",
-                "Q",
-                "G",
-                "H",
-                "I",
-                "L",
-                "K",
-                "M",
-                "F",
-                "P",
-                "S",
-                "T",
-                "W",
-                "Y",
-                "V",
-                ".",
-                "-",
-            ],
-            special_tokens=["<cls>", "<pad>", "<eos>", "<unk>", "<mask>"],
-        )
-
-        tokenizer.special_tokens_map = {"pad_token": pad_token, "unk_token": unk_token}
-        return tokenizer, trainer
-
-    @staticmethod
-    def _batch_iterator(hf_dataset, batch_size, text_column):
-        for i in range(0, len(hf_dataset), batch_size):
-            yield hf_dataset[i : i + batch_size][text_column]
-
-    @staticmethod
-    def _batch_txt_to_hf_iterator(txt_file, batch_size, text_column="text"):
-        hf_dataset = load_dataset(text_column, data_files=[txt_file])
-        for i in range(0, len(hf_dataset["train"]), batch_size):
-            yield hf_dataset["train"][i : i + batch_size][text_column]
-
-    @staticmethod
-    def _batch_txt_iterator(txt_file, num_lines):
-        with open(txt_file, "r") as f:
-            return list(itertools.islice(f, num_lines))
-
-    def fit(self, txt_file, num_lines=100):
-        self._tokenizer.train_from_iterator(
-            self._batch_txt_iterator(txt_file, num_lines),
-            trainer=self._trainer,
-            # length=len(hf_dataset),
-        )
-        super().__init__(tokenizer_object=self._tokenizer)
-        # setattr(self, "model_input_names", ["input_ids"])
-        self.model_input_names = ["input_ids"]
-        for k, v in self._tokenizer.special_tokens_map.items():
-            setattr(self, k, v)
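
Note: for reference, the BPE training flow that the deleted class wrapped can be reproduced directly with the `tokenizers` library. The sketch below mirrors the removed `fit` method under a few assumptions: the input file "sequences.txt" (one protein sequence per line), the 100-line cap, the ESM-style special-token set, and the output path are illustrative stand-ins, not part of this patch.

    import itertools

    import tokenizers

    # Amino-acid alphabet plus "." and "-", matching the initial_alphabet
    # used by the removed TrainableProteinMLMTokenizer.
    PROTEIN_ALPHABET = [
        "A", "R", "N", "D", "C", "E", "Q", "G", "H", "I",
        "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V", ".", "-",
    ]

    # Build a BPE tokenizer with NFKC normalization and byte-level pre-tokenization.
    tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE(unk_token="<unk>"))
    tokenizer.normalizer = tokenizers.normalizers.NFKC()
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel()

    trainer = tokenizers.trainers.BpeTrainer(
        vocab_size=1280,  # the removed class's max_vocab_size default
        initial_alphabet=PROTEIN_ALPHABET,
        special_tokens=["<cls>", "<pad>", "<eos>", "<unk>", "<mask>"],
    )

    # Train on the first 100 lines of a hypothetical plain-text sequence file,
    # as the removed fit(txt_file, num_lines=100) did.
    with open("sequences.txt") as f:
        tokenizer.train_from_iterator(itertools.islice(f, 100), trainer=trainer)

    # Illustrative output path; serialize the trained tokenizer to JSON.
    tokenizer.save("protein_bpe_tokenizer.json")

The trained `tokenizers.Tokenizer` can then be wrapped the way the removed `fit` did, e.g. by passing it as `tokenizer_object` when constructing `ProteinMLMTokenizer`.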