FIX Results.
andre-martins committed Jun 27, 2014
2 parents ef6d483 + a98cffa commit 71e401b
Showing 28 changed files with 68,678 additions and 1,845 deletions.
7 changes: 7 additions & 0 deletions ChangeLog
@@ -26,3 +26,10 @@
* Introduced third-order parts (as described in our ACL 2013 short paper).
* Windows-compatible (tested under MSVC; thanks to Afonso Mendes).
* TurboTagger is now faster.

2014-06-26 TurboParser 2.2.0 [email protected]
New release with some additional features:
* Implemented a Python wrapper using Cython.
* Added a semantic parser, TurboSemanticParser (as described in our SemEval
  2014 paper).
* Added a tokenizer for Portuguese.
26 changes: 26 additions & 0 deletions Makefile.am
@@ -26,6 +26,32 @@ scripts/train_test_parser.sh \
scripts/train_test_tagger.sh \
data/sample/sample_train.conll \
data/sample/sample_test.conll \
scripts_srl/eval08.pl \
scripts_srl/remove_augmented.py \
scripts_srl/train_test_semantic_parser.sh \
scripts_srl/train_test_submission_closed.sh \
scripts_srl/train_test_submission_open.sh \
semeval2014_data/README \
semeval2014_data/scripts/augment_with_companion_data.py \
semeval2014_data/scripts/dev_ids \
semeval2014_data/scripts/generate_all_splits.sh \
semeval2014_data/scripts/split_data.py \
semeval2014_data/scripts/train+dev_ids \
semeval2014_data/scripts/train_ids \
libturboparser/Makefile \
libturboparser/TurboParserInterface.cpp \
libturboparser/TurboParserInterface.h \
python/install_wrapper.sh \
python/lemmatizer.py \
python/nlp_pipeline.py \
python/README \
python/setup.py \
python/turbo_parser.pyx \
python/tokenizers/__init__.py \
python/tokenizers/portuguese/__init__.py \
python/tokenizers/portuguese/clitics.py \
python/tokenizers/portuguese/contractions.py \
python/tokenizers/portuguese/word_tokenizer.py \
install_deps.sh \
deps/AD3-2.0.2.tar.gz \
deps/gflags-2.0-no-svn-files.tar.gz \
2 changes: 1 addition & 1 deletion configure.ac
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.

AC_PREREQ([2.59])
AC_INIT([TurboParser], [2.1.0], [[email protected]], [TurboParser],
AC_INIT([TurboParser], [2.2.0], [[email protected]], [TurboParser],
[http://www.ark.cs.cmu.edu/TurboParser/])

AM_INIT_AUTOMAKE([1.10 -Wall no-define])
26 changes: 26 additions & 0 deletions python/README
@@ -0,0 +1,26 @@
To install the Python wrapper for TurboParser, please follow the instructions
below. You need to have Cython (at least version 0.19) installed. To install
Cython, type

easy_install cython
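
(If easy_install is not available, pip install cython should work as well.)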

Then run:

./install_wrapper.sh

This should create a file turboparser.so in a local subfolder build/<something>
(e.g. build/lib.linux-x86_64-2.7). Create a symbolic link to this file in the
python folder:

ln -s build/lib.linux-x86_64-2.7/turboparser.so turboparser.so

To test, open a Python shell and type:

>>> import nlp_pipeline

(You may need to add the library path first, e.g.

export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:<root folder>/deps/local/lib"

where <root folder> is an absolute path to the folder where TurboParser is
located.)
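
Once the import succeeds, a minimal usage sketch (unofficial) looks as follows.
The pipeline class name NLPPipeline, its no-argument constructor, and the
availability of the 'PT' models at the paths hard-coded in nlp_pipeline.py are
assumptions here, not guarantees:

>>> import nlp_pipeline
>>> pipeline = nlp_pipeline.NLPPipeline()  # assumed constructor
>>> print pipeline.parse_conll(u'A Maria deu um livro ao Pedro.', 'PT')

parse_conll returns one CoNLL-formatted line per token (id, form, lemma, tags,
head, and dependency relation), with a blank line between sentences.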
18 changes: 18 additions & 0 deletions python/install_wrapper.sh
@@ -0,0 +1,18 @@
#!/bin/bash

# Root folder where TurboParser is located.
root_folder="`cd $(dirname $0);cd ..;pwd`"

# Lib folder.
lib_folder=${root_folder}/libturboparser

# Python folder.
python_folder=${root_folder}/python

# Make a static lib.
cd $lib_folder
make

# Now use cython to build a Python wrapper.
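# The compiled extension (turboparser.so) should end up under build/<platform>,
# e.g. build/lib.linux-x86_64-2.7 (see python/README).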
cd $python_folder
python setup.py build_ext
53 changes: 53 additions & 0 deletions python/lemmatizer.py
@@ -0,0 +1,53 @@
import pdb

class BasicLemmatizer:
def __init__(self):
self.lemmas = {}

def load_lemmatizer_model(self, file_model):
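# Each line of the model file holds a tab-separated triple: word, tag, lemma.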
self.lemmas = {}
f = open(file_model)
for line in f:
line = line.rstrip('\n')
fields = line.split('\t')
self.lemmas[(fields[0], fields[1])] = fields[2]
f.close()

def lemmatize_sentence(self, tokenized_sentence, tags):
lemmas = []
for word, tag in zip(tokenized_sentence, tags):
if (word, tag) in self.lemmas:
lemma = self.lemmas[(word, tag)]
else:
lemma = word
lemmas.append(lemma)
return lemmas

def lemmatize(self, file_test, file_prediction):
f = open(file_test)
f_out = open(file_prediction, 'w')
for line in f:
line = line.rstrip('\n')
if line == '':
f_out.write(line + '\n')
continue
elif line.startswith('#begin document'):
f_out.write(line + '\n')
continue
elif line.startswith('#end document'):
f_out.write(line + '\n')
continue

fields = line.split()
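# CoNLL-style columns: fields[1] is the word form, fields[3] the tag;
# fields[2] is overwritten below with the predicted lemma.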
word = fields[1]
tag = fields[3]
if (word, tag) in self.lemmas:
lemma = self.lemmas[(word, tag)]
else:
lemma = word
fields_out = fields
fields_out[2] = lemma
f_out.write('\t'.join(fields_out) + '\n')

f.close()
f_out.close()
54 changes: 38 additions & 16 deletions python/nlp_pipeline.py
@@ -1,17 +1,31 @@
import nltk
import tokenizers.portuguese.cintil_tokenizer as tokenizer_PT
import tokenizers.portuguese.word_tokenizer as tokenizer_PT
import lemmatizer
import turboparser as tp
import pdb

class NLPPipelineWorker:
def __init__(self, pipeline, language):
self.tagger = pipeline.turbo_interface.create_tagger()
self.parser = pipeline.turbo_interface.create_parser()
if language == 'PT':
self.tagger = pipeline.turbo_interface.create_tagger()
self.parser = pipeline.turbo_interface.create_parser()
self.lemmatizer = None
if language == 'PT':
self.sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
self.word_tokenizer = tokenizer_PT.PortugueseFlorestaWordTokenizer()
self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_tagger.model')
self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_parser_pruned-true_model-standard.model')
self.lemmatizer = lemmatizer.BasicLemmatizer()
self.lemmatizer.load_lemmatizer_model('/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_lemmatizer.model')
elif language == 'PT-Cintil':
self.sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
self.word_tokenizer = tokenizer_PT.PortugueseCintilWordTokenizer()
self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_tagger.model')
self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_parser_pruned-true_model-standard.model')
self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_parser_pruned-true_model-standard.model')
elif language == 'ES':
self.sent_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
self.word_tokenizer = nltk.TreebankWordTokenizer() # For now...
self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/spanish_ancora_finertags_nomwe_auto/spanish_ancora_finertags_nomwe_auto_tagger.model')
self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/spanish_ancora_finertags_nomwe_auto/spanish_ancora_finertags_nomwe_auto_parser_pruned-true_model-standard.model')
elif language == 'EN':
self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
self.word_tokenizer = nltk.TreebankWordTokenizer()
@@ -59,15 +73,22 @@ def tag(self, tokenized_sentence, language):
fields = line.split('\t')
tag = fields[1]
tags.append(tag)
f_tagging_pred.close()
return tags

def parse(self, tokenized_sentence, tags, language):
f_tagging_pred.close()
if worker.lemmatizer != None:
lemmas = worker.lemmatizer.lemmatize_sentence(tokenized_sentence,
tags)
else:
lemmas = ['_' for token in tokenized_sentence]
return tags, lemmas

def parse(self, tokenized_sentence, tags, lemmas, language):
worker = self.get_worker(language)
f_conll = open('conll.tmp', 'w')
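# Write the sentence in CoNLL format: id, form, lemma, coarse tag, fine tag,
# and '_' placeholders for the remaining columns.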
for i, token in enumerate(tokenized_sentence):
tag = tags[i]
f_conll.write(str(i+1) + '\t' + token + '\t_\t' + tag + '\t' + tag + '\t_\t_\t_\n')
lemma = lemmas[i]
f_conll.write(str(i+1) + '\t' + token + '\t' + lemma + '\t' +
tag + '\t' + tag + '\t_\t_\t_\n')
f_conll.close()
worker.parser.parse('conll.tmp', 'conll.tmp.pred')
f_conll_pred = open('conll.tmp.pred')
@@ -79,24 +100,25 @@ def parse(self, tokenized_sentence, tags, language):
if line == '':
continue
fields = line.split('\t')
lemma = fields[2]
head = int(fields[6])
deprel = fields[7]
heads.append(head)
deprels.append(deprel)
f_conll_pred.close()
f_conll_pred.close()
return heads, deprels

def parse_conll(self, text, language):
sentences = self.split_sentences(text, language)
conll_str = ''
for sentence in sentences:
tokenized_sentence = self.tokenize(sentence, language)
tags = self.tag(tokenized_sentence, language)
heads, deprels = self.parse(tokenized_sentence, tags, language)
tags, lemmas = self.tag(tokenized_sentence, language)
heads, deprels = self.parse(tokenized_sentence, tags, lemmas,
language)
for i, token in enumerate(tokenized_sentence):
conll_str += str(i+1) + '\t' + token + '\t_\t' + tags[i] + '\t' + tags[i] + '\t_\t' + str(heads[i]) + '\t' + deprels[i] + '\n'
conll_str += str(i+1) + '\t' + token + '\t' + lemmas[i] + '\t' + tags[i] + '\t' + tags[i] + '\t_\t' + str(heads[i]) + '\t' + deprels[i] + '\n'
conll_str += '\n'
#print conll_str
return conll_str


141 changes: 0 additions & 141 deletions python/tokenizers/portuguese/cintil_tokenizer.py

This file was deleted.

