diff --git a/word_language_model/data.py b/word_language_model/data.py
index a4e353b0c8..5ecdd8744e 100644
--- a/word_language_model/data.py
+++ b/word_language_model/data.py
@@ -27,7 +27,7 @@ def tokenize(self, path):
         """Tokenizes a text file."""
         assert os.path.exists(path)
         # Add words to the dictionary
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding="utf8") as f:
             tokens = 0
             for line in f:
                 words = line.split() + ['<eos>']
@@ -36,7 +36,7 @@ def tokenize(self, path):
                     self.dictionary.add_word(word)
 
         # Tokenize file content
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding="utf8") as f:
             ids = torch.LongTensor(tokens)
             token = 0
             for line in f:
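
Why the change, as a minimal sketch (not part of the patch): without an explicit encoding argument, Python's open() falls back to locale.getpreferredencoding(), which is often cp1252 on Windows, so reading a UTF-8 corpus there can raise UnicodeDecodeError or silently mis-decode non-ASCII tokens. The temp-file setup below is illustrative only; it stands in for a corpus file such as the one tokenize() reads.

    import tempfile

    # Write a small UTF-8 corpus file (stand-in for the real training text).
    with tempfile.NamedTemporaryFile(mode="w", encoding="utf8",
                                     suffix=".txt", delete=False) as tmp:
        tmp.write("naïve café <eos>\n")  # non-ASCII words the tokenizer must read
        path = tmp.name

    # Portable read: decodes the same bytes on every platform, as in the patch.
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            print(line.split())  # ['naïve', 'café', '<eos>']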