FIX Made the Portuguese tokenizer split on tabs.
andre-martins committed Dec 5, 2014
1 parent 99ed0ee commit 107a849
Showing 2 changed files with 4 additions and 1 deletion.
3 changes: 3 additions & 0 deletions python/tokenizers/portuguese/word_tokenizer.py
@@ -85,6 +85,9 @@ def tokenize(self, text):
 # Note: the Portuguese sentence tokenizer should also do this!!
 text = re.sub('\xc2\xa0', ' ', text)
 
+# Replace tabs by spaces [ATM 3/12/2014].
+text = re.sub('\t', ' ', text)
+
 # Replace U+0096 by dashes.
 text = re.sub('\xc2\x96', ' -- ', text)

2 changes: 1 addition & 1 deletion scripts_srl/evaluator
Submodule evaluator updated from b43ebd to 5a9318
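
For context, a minimal runnable sketch of the whitespace normalization that tokenize() performs after this commit. The normalize_whitespace helper name is hypothetical, and unlike the original Python 2 code, which matches UTF-8 byte pairs such as '\xc2\xa0', this sketch assumes decoded unicode strings:

# -*- coding: utf-8 -*-
import re

def normalize_whitespace(text):
    # Hypothetical helper mirroring the normalization steps in
    # tokenize(). On decoded strings, U+00A0 and U+0096 are matched
    # directly rather than as the byte pairs '\xc2\xa0' / '\xc2\x96'
    # used in the original Python 2 code.
    text = re.sub(u'\xa0', ' ', text)     # non-breaking space -> space
    text = re.sub('\t', ' ', text)        # tab -> space (this commit)
    text = re.sub(u'\x96', ' -- ', text)  # U+0096 -> ' -- '
    return text

print(normalize_whitespace(u'um\tdois\xa0tr\xeas'))
# -> 'um dois três'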
