From 107a84906fe11fc99e526c5ef8818e8471f3529f Mon Sep 17 00:00:00 2001
From: Andre Martins
Date: Fri, 5 Dec 2014 10:29:36 +0000
Subject: [PATCH] FIX Made the Portuguese tokenizer split on tabs.

---
 python/tokenizers/portuguese/word_tokenizer.py | 3 +++
 scripts_srl/evaluator                          | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/tokenizers/portuguese/word_tokenizer.py b/python/tokenizers/portuguese/word_tokenizer.py
index dbdb44e..81038de 100644
--- a/python/tokenizers/portuguese/word_tokenizer.py
+++ b/python/tokenizers/portuguese/word_tokenizer.py
@@ -85,6 +85,9 @@ def tokenize(self, text):
         # Note: the Portuguese sentence tokenizer should also do this!!
         text = re.sub('\xc2\xa0', ' ', text)
 
+        # Replace tabs by spaces [ATM 3/12/2014].
+        text = re.sub('\t', ' ', text)
+
         # Replace U+0096 by dashes.
         text = re.sub('\xc2\x96', ' -- ', text)
 
diff --git a/scripts_srl/evaluator b/scripts_srl/evaluator
index b43ebdd..5a93180 160000
--- a/scripts_srl/evaluator
+++ b/scripts_srl/evaluator
@@ -1 +1 @@
-Subproject commit b43ebdd8bced2c5d7d026f76547129c2beb13feb
+Subproject commit 5a93180098f76a08ada25344f52f73538574e98b
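
The added line maps tab characters to spaces before the rest of tokenization runs, so text pasted from tab-separated sources no longer yields tokens glued together by a tab. The sketch below is not part of the patch; it is a minimal illustration of that behavior, where normalize_whitespace is a hypothetical stand-in for the relevant part of tokenize() in python/tokenizers/portuguese/word_tokenizer.py, assuming the tokenizer later splits on plain spaces.

import re

def normalize_whitespace(text):
    # Hypothetical stand-in for the start of tokenize(): the same three
    # substitutions as in the patched file, including the new tab rule.
    text = re.sub('\xc2\xa0', ' ', text)     # non-breaking space -> space
    text = re.sub('\t', ' ', text)            # tab -> space (the fix in this commit)
    text = re.sub('\xc2\x96', ' -- ', text)   # U+0096 -> dashes
    return text

if __name__ == '__main__':
    # Before the fix, 'casa\tgrande' would reach the splitter as one chunk;
    # after normalization it splits into two tokens.
    print(normalize_whitespace('casa\tgrande').split())  # ['casa', 'grande']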