Skip to content

Commit

Permalink
refactor basic tokenizer
Browse files (browse the repository at this point in the history)
  • Loading branch information
sagorbrur committed Jul 10, 2023
1 parent b1c63e1 commit 73519bc
Showing 1 changed file with 0 additions and 2 deletions.
2 changes: 0 additions & 2 deletions bnlp/tokenizer/basic.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -65,7 +65,6 @@ class BasicTokenizer:
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
# handle (.) in bangla text

orig_tokens = whitespace_tokenize(text)
# print("original tokens: ", orig_tokens)
Expand All @@ -75,7 +74,6 @@ def tokenize(self, text):

# print("split tokens: ", split_tokens)
output_tokens = whitespace_tokenize(" ".join(split_tokens))
# get (.) back in output tokens
return output_tokens

def _run_strip_accents(self, text):
Expand Down

0 comments on commit 73519bc

Please sign in to comment.