diff --git a/mammoth/inputters/dataset.py b/mammoth/inputters/dataset.py
index 96bebe74..f79bdd79 100644
--- a/mammoth/inputters/dataset.py
+++ b/mammoth/inputters/dataset.py
@@ -115,6 +115,7 @@ def _tokenize(self, string, side='src'):
 
     def _numericalize(self, tokens, side='src'):
         """Convert list of strings into list of indices"""
+        print(side, tokens)
         vocab = self.vocabs[side]
         bos = vocab[DefaultTokens.BOS]
         eos = vocab[DefaultTokens.EOS]
@@ -124,6 +125,7 @@ def _numericalize(self, tokens, side='src'):
             *(vocab.stoi.get(token, unk) for token in tokens),
             eos,
         ], device='cpu')
+        print(indices)
         return indices
 
     def to(self, device):
diff --git a/mammoth/transforms/tokenize.py b/mammoth/transforms/tokenize.py
index 5c6e283a..53dda7cf 100644
--- a/mammoth/transforms/tokenize.py
+++ b/mammoth/transforms/tokenize.py
@@ -217,6 +217,7 @@ def _tokenize(self, tokens, side='src', is_train=False):
             segmented = sp_model.encode(
                 sentence, out_type=str, enable_sampling=True, alpha=alpha, nbest_size=nbest_size
            )
+        print(f'segmented: {segmented}')
         return segmented
 
     def apply(self, example, is_train=False, stats=None, **kwargs):
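
For reference, a minimal self-contained sketch (not the mammoth API) of the numericalization pattern the first two prints inspect: wrap the token indices in BOS/EOS and fall back to UNK for out-of-vocabulary tokens. The stoi dict and special-token ids below are hypothetical stand-ins for the vocab object used in dataset.py.

import torch

# Hypothetical stand-in vocabulary; mammoth's vocab exposes a stoi mapping similarly.
stoi = {'<s>': 0, '</s>': 1, '<unk>': 2, '▁hello': 3, '▁world': 4}
bos, eos, unk = stoi['<s>'], stoi['</s>'], stoi['<unk>']

def numericalize(tokens):
    # Mirrors _numericalize: BOS, per-token lookup with UNK fallback, EOS.
    return torch.tensor(
        [bos, *(stoi.get(token, unk) for token in tokens), eos],
        device='cpu',
    )

print(numericalize(['▁hello', '▁world', '▁oov']))  # tensor([0, 3, 4, 2, 1])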
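
The third print traces SentencePiece subword regularization. A hedged sketch of the same encode call, assuming a trained model file exists ('spm.model' is a hypothetical path, and alpha=0.1 / nbest_size=-1 are example values for the variables used in tokenize.py); with enable_sampling=True the segmentation is stochastic, so repeated calls can yield different splits.

import sentencepiece as spm

# 'spm.model' is a hypothetical path to a trained SentencePiece model.
sp_model = spm.SentencePieceProcessor(model_file='spm.model')
segmented = sp_model.encode(
    'hello world', out_type=str, enable_sampling=True, alpha=0.1, nbest_size=-1
)
print(f'segmented: {segmented}')  # e.g. ['▁he', 'llo', '▁world'] (varies per call)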