Skip to content

Commit

Permalink
ENH Added semantic parsing to the pipeline.
Browse files Browse the repository at this point in the history
  • Loading branch information
andre-martins committed Sep 19, 2014
1 parent 01e13f5 commit 999676f
Show file tree
Hide file tree
Showing 13 changed files with 388 additions and 182 deletions.
39 changes: 36 additions & 3 deletions libturboparser/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@ UTIL = ../src/util
# Source directories whose files are compiled into this library; each one is
# also added to the include path through INCLUDES below.
CLASSIFIER = ../src/classifier
TAGGER = ../src/tagger
PARSER = ../src/parser
SEMANTICPARSER = ../src/semantic_parser
# Library and header directories under ../deps/local (presumably the locally
# built third-party dependencies named in LFLAGS -- confirm).
AUXLIBS = ../deps/local/lib
AUXINCLUDES = ../deps/local/include

# Object files archived into libturboparser.a.
# NOTE(review): OBJS is assigned twice in a row. With '=' the later assignment
# replaces the earlier one, so only the second list (the one that includes the
# Semantic*.o objects) takes effect. This looks like the pre- and post-change
# versions of the same line; confirm the first one can be dropped.
OBJS = TurboParserInterface.o DependencyDecoder.o DependencyDictionary.o DependencyFeatures.o DependencyInstance.o DependencyInstanceNumeric.o DependencyOptions.o DependencyPart.o DependencyPipe.o DependencyReader.o DependencyWriter.o SequenceDecoder.o SequenceDictionary.o SequenceFeatures.o SequenceInstance.o SequenceInstanceNumeric.o SequenceOptions.o SequencePart.o SequencePipe.o SequenceReader.o SequenceWriter.o TokenDictionary.o Alphabet.o Dictionary.o Options.o Parameters.o Pipe.o Reader.o Writer.o AlgUtils.o SerializationUtils.o StringUtils.o TimeUtils.o
OBJS = TurboParserInterface.o SemanticDecoder.o SemanticDictionary.o SemanticFeatures.o SemanticInstanceNumeric.o SemanticInstance.o SemanticOptions.o SemanticPart.o SemanticPipe.o SemanticReader.o SemanticWriter.o DependencyDecoder.o DependencyDictionary.o DependencyFeatures.o DependencyInstance.o DependencyInstanceNumeric.o DependencyOptions.o DependencyPart.o DependencyPipe.o DependencyReader.o DependencyWriter.o SequenceDecoder.o SequenceDictionary.o SequenceFeatures.o SequenceInstance.o SequenceInstanceNumeric.o SequenceOptions.o SequencePart.o SequencePipe.o SequenceReader.o SequenceWriter.o TokenDictionary.o Alphabet.o Dictionary.o Options.o Parameters.o Pipe.o Reader.o Writer.o AlgUtils.o SerializationUtils.o StringUtils.o TimeUtils.o
# C++ compiler used by every compile recipe.
CC = g++
# NOTE(review): DEBUG is defined here but is not referenced by CFLAGS or LFLAGS
# in the visible part of this file.
DEBUG = -g
# Header search path.
# NOTE(review): INCLUDES is also assigned twice; the later assignment (which
# adds -I$(SEMANTICPARSER)) is the one that takes effect.
INCLUDES = -I$(UTIL)/ -I$(CLASSIFIER) -I$(TAGGER) -I$(PARSER) -I$(AUXINCLUDES)
INCLUDES = -I$(UTIL)/ -I$(CLASSIFIER) -I$(TAGGER) -I$(PARSER) -I$(SEMANTICPARSER) -I$(AUXINCLUDES)
# Library search path.
LIBS = -L/usr/local/lib/ -L$(AUXLIBS)
# CFLAGS carries -c, so each compile recipe stops after producing an object
# file; no recipe passes -o, so objects land in the current directory.
# -fPIC produces position-independent code.
CFLAGS = -O3 -Wall -Wno-sign-compare -c -fmessage-length=0 -fPIC $(INCLUDES)
# Link flags: the search path plus the ad3, gflags and glog libraries.
LFLAGS = $(LIBS) -lad3 -lgflags -lglog
Expand All @@ -18,11 +19,43 @@ all : libturboparser.a
# Bundle every object file into the static library. "ar rcs" replaces or adds
# members (r), creates the archive if needed (c) and writes a symbol index (s).
libturboparser.a : $(OBJS)
	ar rcs $@ $(OBJS)

# Compile the library's C++ interface wrapper.
# NOTE(review): the target is declared on two consecutive header lines that
# differ only in $(SEMANTICPARSER)/SemanticPipe.h. make merges the prerequisites
# of both lines, so the first line is redundant; it looks like the pre-change
# version of the rule -- confirm it can be removed.
TurboParserInterface.o: TurboParserInterface.h TurboParserInterface.cpp $(TAGGER)/SequencePipe.h $(PARSER)/DependencyPipe.h $(UTIL)/Utils.h
TurboParserInterface.o: TurboParserInterface.h TurboParserInterface.cpp $(TAGGER)/SequencePipe.h $(PARSER)/DependencyPipe.h $(SEMANTICPARSER)/SemanticPipe.h $(UTIL)/Utils.h
$(CC) $(CFLAGS) TurboParserInterface.cpp

#####################

# Object rules for the semantic parser sources in $(SEMANTICPARSER).
# Each rule lists its prerequisites one per line (a trailing backslash joins
# them into a single list) and compiles only the matching .cpp file. CFLAGS
# contains -c and no recipe passes -o, so each object is written to the current
# directory under the target's name.

# Decoder: depends on the part and pipe headers, the factor headers it lists,
# and on utility and classifier headers.
SemanticDecoder.o: $(SEMANTICPARSER)/SemanticDecoder.h \
                   $(SEMANTICPARSER)/SemanticDecoder.cpp \
                   $(SEMANTICPARSER)/SemanticPart.h \
                   $(SEMANTICPARSER)/SemanticPipe.h \
                   $(PARSER)/FactorTree.h \
                   $(SEMANTICPARSER)/FactorPredicateAutomaton.h \
                   $(SEMANTICPARSER)/FactorArgumentAutomaton.h \
                   $(UTIL)/AlgUtils.h \
                   $(UTIL)/logval.h \
                   $(CLASSIFIER)/Decoder.h
	$(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticDecoder.cpp

# Dictionary: depends on the generic and token dictionaries and on the
# serialization helpers.
SemanticDictionary.o: $(SEMANTICPARSER)/SemanticDictionary.h \
                      $(SEMANTICPARSER)/SemanticDictionary.cpp \
                      $(SEMANTICPARSER)/SemanticPipe.h \
                      $(CLASSIFIER)/Dictionary.h \
                      $(TAGGER)/TokenDictionary.h \
                      $(UTIL)/SerializationUtils.h
	$(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticDictionary.cpp

# Features: depends on the feature templates, the numeric instance and the
# classifier's feature headers.
SemanticFeatures.o: $(SEMANTICPARSER)/SemanticFeatures.h \
                    $(SEMANTICPARSER)/SemanticFeatures.cpp \
                    $(SEMANTICPARSER)/SemanticPipe.h \
                    $(SEMANTICPARSER)/SemanticPart.h \
                    $(SEMANTICPARSER)/SemanticFeatureTemplates.h \
                    $(CLASSIFIER)/Features.h \
                    $(SEMANTICPARSER)/SemanticInstanceNumeric.h \
                    $(CLASSIFIER)/FeatureEncoder.h
	$(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticFeatures.cpp

# Instance: depends on the classifier's Instance header.
SemanticInstance.o: $(SEMANTICPARSER)/SemanticInstance.h \
                    $(SEMANTICPARSER)/SemanticInstance.cpp \
                    $(CLASSIFIER)/Instance.h
	$(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticInstance.cpp

# Numeric instance: depends on the instance and dictionary headers.
SemanticInstanceNumeric.o: $(SEMANTICPARSER)/SemanticInstanceNumeric.h \
                           $(SEMANTICPARSER)/SemanticInstanceNumeric.cpp \
                           $(SEMANTICPARSER)/SemanticInstance.h \
                           $(SEMANTICPARSER)/SemanticDictionary.h
	$(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticInstanceNumeric.cpp

# Options: depends on the serialization helpers and the classifier's Options
# header.
SemanticOptions.o: $(SEMANTICPARSER)/SemanticOptions.h \
                   $(SEMANTICPARSER)/SemanticOptions.cpp \
                   $(UTIL)/SerializationUtils.h \
                   $(CLASSIFIER)/Options.h
	$(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticOptions.cpp

# Part: depends on the classifier's Part header.
SemanticPart.o: $(SEMANTICPARSER)/SemanticPart.h \
                $(SEMANTICPARSER)/SemanticPart.cpp \
                $(CLASSIFIER)/Part.h
	$(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticPart.cpp

# Pipe: depends on the headers of every other semantic parser component plus
# the classifier's Pipe header and the token dictionary.
SemanticPipe.o: $(SEMANTICPARSER)/SemanticPipe.h \
                $(SEMANTICPARSER)/SemanticPipe.cpp \
                $(CLASSIFIER)/Pipe.h \
                $(SEMANTICPARSER)/SemanticOptions.h \
                $(SEMANTICPARSER)/SemanticReader.h \
                $(SEMANTICPARSER)/SemanticDictionary.h \
                $(TAGGER)/TokenDictionary.h \
                $(SEMANTICPARSER)/SemanticInstanceNumeric.h \
                $(SEMANTICPARSER)/SemanticWriter.h \
                $(SEMANTICPARSER)/SemanticPart.h \
                $(SEMANTICPARSER)/SemanticFeatures.h \
                $(SEMANTICPARSER)/SemanticDecoder.h
	$(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticPipe.cpp

# Reader: depends on the instance header and the classifier's Reader header.
SemanticReader.o: $(SEMANTICPARSER)/SemanticReader.h \
                  $(SEMANTICPARSER)/SemanticReader.cpp \
                  $(SEMANTICPARSER)/SemanticInstance.h \
                  $(CLASSIFIER)/Reader.h
	$(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticReader.cpp

# Writer: depends on the instance header and the classifier's Writer header.
SemanticWriter.o: $(SEMANTICPARSER)/SemanticWriter.h \
                  $(SEMANTICPARSER)/SemanticWriter.cpp \
                  $(SEMANTICPARSER)/SemanticInstance.h \
                  $(CLASSIFIER)/Writer.h
	$(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticWriter.cpp

#####################

# Dependency parser decoder: depends on the part and pipe headers, the factor
# headers it lists, and on utility and classifier headers. Only the .cpp file
# is passed to the compiler; CFLAGS supplies -c.
DependencyDecoder.o: $(PARSER)/DependencyDecoder.h \
                     $(PARSER)/DependencyDecoder.cpp \
                     $(PARSER)/DependencyPart.h \
                     $(PARSER)/DependencyPipe.h \
                     $(PARSER)/FactorTree.h \
                     $(PARSER)/FactorHeadAutomaton.h \
                     $(PARSER)/FactorGrandparentHeadAutomaton.h \
                     $(PARSER)/FactorTrigramHeadAutomaton.h \
                     $(PARSER)/FactorSequence.h \
                     $(UTIL)/AlgUtils.h \
                     $(UTIL)/logval.h \
                     $(CLASSIFIER)/Decoder.h
	$(CC) $(CFLAGS) $(PARSER)/DependencyDecoder.cpp

Expand Down
56 changes: 56 additions & 0 deletions libturboparser/TurboParserInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,59 @@ void TurboParserWorker::Parse(const std::string &file_test,
<< " sec." << endl;
}

// Creates the worker's options object and the pipe that is built from it,
// initializing each one right after it is constructed.
TurboSemanticParserWorker::TurboSemanticParserWorker()
    : semantic_options_(new SemanticOptions), semantic_pipe_(NULL) {
  semantic_options_->Initialize();

  // The pipe receives the already-initialized options at construction time.
  semantic_pipe_ = new SemanticPipe(semantic_options_);
  semantic_pipe_->Initialize();
}

// Releases the pipe and then the options, logging before each deletion.
TurboSemanticParserWorker::~TurboSemanticParserWorker() {
  // The pipe was constructed from the options object, so it is deleted first.
  LOG(INFO) << "Deleting semantic pipe.";
  delete semantic_pipe_;

  LOG(INFO) << "Deleting semantic options.";
  delete semantic_options_;
}

void TurboSemanticParserWorker::LoadSemanticParserModel(
const std::string &file_model) {
semantic_options_->SetModelFilePath(file_model);

int time;
timeval start, end;
gettimeofday(&start, NULL);

LOG(INFO) << "Loading model file " << file_model;

semantic_pipe_->LoadModelFile();

gettimeofday(&end, NULL);
time = diff_ms(end,start);

LOG(INFO) << "Took " << static_cast<double>(time)/1000.0
<< " sec." << endl;
}

void TurboSemanticParserWorker::ParseSemanticDependencies(
const std::string &file_test,
const std::string &file_prediction) {
semantic_options_->SetTestFilePath(file_test);
semantic_options_->SetOutputFilePath(file_prediction);

int time;
timeval start, end;
gettimeofday(&start, NULL);

semantic_pipe_->Run();

gettimeofday(&end, NULL);
time = diff_ms(end,start);

LOG(INFO) << "Took " << static_cast<double>(time)/1000.0
<< " sec." << endl;
}

TurboParserInterface::TurboParserInterface() {
argc_ = 0;
argv_ = NULL;
Expand All @@ -133,6 +186,9 @@ TurboParserInterface::~TurboParserInterface() {
LOG(INFO) << "Deleting parser workers.";
DeleteAllParsers();

LOG(INFO) << "Deleting semantic parser workers.";
DeleteAllSemanticParsers();

LOG(INFO) << "Clearing argument list.";
ClearArgumentList();

Expand Down
30 changes: 30 additions & 0 deletions libturboparser/TurboParserInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <stdlib.h>
#include "SequencePipe.h"
#include "DependencyPipe.h"
#include "SemanticPipe.h"

namespace TurboParserInterface {

Expand Down Expand Up @@ -35,6 +36,21 @@ class TurboParserWorker {
DependencyPipe *parser_pipe_;
};

// Worker that owns one SemanticOptions / SemanticPipe pair and exposes model
// loading and file-to-file semantic parsing.
//
// The constructor allocates both objects and the destructor deletes them, so
// an instance must never be copied: a copy would share the raw pointers and
// delete them a second time. Copying is therefore disabled below.
class TurboSemanticParserWorker {
 public:
  // Allocates and initializes the options and the pipe.
  TurboSemanticParserWorker();
  // Deletes the pipe and the options.
  virtual ~TurboSemanticParserWorker();

  // Sets |file_model| as the model path and loads it through the pipe.
  void LoadSemanticParserModel(const std::string &file_model);

  // Sets |file_test| as the input file and |file_prediction| as the output
  // file, then runs the pipe.
  void ParseSemanticDependencies(const std::string &file_test,
                                 const std::string &file_prediction);

 private:
  // Declared but intentionally not defined: copying would double-delete the
  // owned pointers.
  TurboSemanticParserWorker(const TurboSemanticParserWorker &);
  TurboSemanticParserWorker &operator=(const TurboSemanticParserWorker &);

  SemanticOptions *semantic_options_;  // Owned.
  SemanticPipe *semantic_pipe_;        // Owned; constructed from the options.
};

class TurboParserInterface {
public:
TurboParserInterface();
Expand Down Expand Up @@ -67,6 +83,12 @@ class TurboParserInterface {
return parser;
}

// Allocates a new semantic parser worker, records it in semantic_parsers_
// (whose entries DeleteAllSemanticParsers() deletes) and returns it. The
// caller must not delete the returned pointer.
TurboSemanticParserWorker *CreateSemanticParser() {
  semantic_parsers_.push_back(new TurboSemanticParserWorker());
  return semantic_parsers_.back();
}

void DeleteAllTaggers() {
for (int i = 0; i < taggers_.size(); ++i) {
delete taggers_[i];
Expand All @@ -81,11 +103,19 @@ class TurboParserInterface {
parsers_.clear();
}

void DeleteAllSemanticParsers() {
for (int i = 0; i < semantic_parsers_.size(); ++i) {
delete semantic_parsers_[i];
}
semantic_parsers_.clear();
}

private:
int argc_;
char** argv_;
vector<TurboTaggerWorker*> taggers_;
vector<TurboParserWorker*> parsers_;
vector<TurboSemanticParserWorker*> semantic_parsers_;
};

} // namespace TurboParserInterface.
57 changes: 52 additions & 5 deletions python/nlp_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class NLPPipelineWorker:
def __init__(self, pipeline, language):
self.tagger = pipeline.turbo_interface.create_tagger()
self.parser = pipeline.turbo_interface.create_parser()
self.semantic_parser = None
self.lemmatizer = None
if language == 'PT':
self.sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
Expand All @@ -26,13 +27,24 @@ def __init__(self, pipeline, language):
self.word_tokenizer = nltk.TreebankWordTokenizer() # For now...
self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_tagger.model')
self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_parser_pruned-true_model-standard.model')
self.semantic_parser = pipeline.turbo_interface.create_semantic_parser()
self.semantic_parser.load_semantic_parser_model('/home/atm/workspace/CPP/TurboParser/srl/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_semantic_parser_conll2008_pruned-false_model-basic_syntax-true_C-0.01_fp-0.4_fn-0.6.model')
self.lemmatizer = lemmatizer.BasicLemmatizer()
self.lemmatizer.load_lemmatizer_model('/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_lemmatizer.model')
elif language == 'EN':
self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
self.word_tokenizer = nltk.TreebankWordTokenizer()
self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_tagger.model')
self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_parser_pruned-true_model-standard.model')
elif language == 'EN-Nonprojective':
self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
self.word_tokenizer = nltk.TreebankWordTokenizer()
self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_tagger.model')
self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/english/english_parser_pruned-true_model-standard.model')
self.semantic_parser = pipeline.turbo_interface.create_semantic_parser()
self.semantic_parser.load_semantic_parser_model('/home/atm/workspace/CPP/TurboParser/srl/models/english/english_semantic_parser_conll2008_pruned-false_model-basic_syntax-true_C-0.01_fp-0.4_fn-0.6.model')
self.lemmatizer = lemmatizer.BasicLemmatizer()
self.lemmatizer.load_lemmatizer_model('/home/atm/workspace/CPP/TurboParser/models/english/english_lemmatizer.model')
elif language == 'PT-BR-Universal':
self.sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
self.word_tokenizer = nltk.TreebankWordTokenizer() # For now...
Expand Down Expand Up @@ -60,30 +72,30 @@ def __init__(self, pipeline, language):
self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/german_universal/german_universal_parser_pruned-true_model-standard.model')
else:
raise NotImplementedError

class NLPPipeline:
def __init__(self):
    """Create the TurboParser interface and an empty per-language worker cache."""
    self.turbo_interface = tp.PTurboParser()
    # Maps a language code to the worker built for it (filled by get_worker).
    self.workers = dict()

def get_worker(self, language):
    """Return the worker for `language`, building and caching it on first use."""
    if language not in self.workers:
        # Construct first, store second: a failed construction caches nothing.
        created = NLPPipelineWorker(self, language)
        self.workers[language] = created
    return self.workers[language]

def split_sentences(self, text, language):
    """Split `text` into sentences with the sentence tokenizer for `language`."""
    splitter = self.get_worker(language).sent_tokenizer
    return splitter.tokenize(text)

def tokenize(self, sentence, language):
    """Split `sentence` into tokens with the word tokenizer for `language`."""
    word_splitter = self.get_worker(language).word_tokenizer
    return word_splitter.tokenize(sentence)

def tag(self, tokenized_sentence, language):
worker = self.get_worker(language)
f_tagging = open('tagging.tmp', 'w')
Expand Down Expand Up @@ -135,6 +147,41 @@ def parse(self, tokenized_sentence, tags, lemmas, language):
f_conll_pred.close()
return heads, deprels

def has_semantic_parser(self, language):
    """Return True when the worker for `language` has a semantic parser.

    The worker's `semantic_parser` attribute is None when no semantic
    parser is available. The check uses identity (`is not None`) rather
    than `!=`, so the result does not depend on comparison operators
    defined by the parser object.
    """
    worker = self.get_worker(language)
    return worker.semantic_parser is not None

def parse_semantic_dependencies(self, tokenized_sentence, tags, lemmas,
                                heads, deprels, language):
    """Run the semantic parser for `language` on one analysed sentence.

    The sentence is written to 'conll2008.tmp' as one tab-separated line
    per token (1-based index, four '_' placeholders, token, lemma, tag,
    head, dependency relation, and a final '_'). The worker's semantic
    parser reads that file and writes 'conll2008.tmp.pred', which is then
    parsed back.

    Both files use fixed names in the current working directory, so two
    concurrent calls would overwrite each other's files.

    Args:
        tokenized_sentence: list of token strings.
        tags: list of tag strings, parallel to the tokens.
        lemmas: list of lemma strings, parallel to the tokens.
        heads: list of head indices, parallel to the tokens.
        deprels: list of dependency relation strings, parallel to the tokens.
        language: language code used to select the worker.

    Returns:
        A pair (predicates, argument_lists). For every non-empty line of
        the prediction file, `predicates` holds the 11th column and
        `argument_lists` holds the list of all columns after it.

    Raises:
        IndexError: if a non-empty prediction line has fewer than 11 columns.
    """
    worker = self.get_worker(language)
    input_path = 'conll2008.tmp'
    output_path = 'conll2008.tmp.pred'

    # 'with' closes the file even if building or writing a line raises.
    with open(input_path, 'w') as f_conll:
        for i, token in enumerate(tokenized_sentence):
            columns = [str(i + 1), '_', '_', '_', '_', token, lemmas[i],
                       tags[i], str(heads[i]), deprels[i], '_']
            f_conll.write('\t'.join(columns) + '\n')

    worker.semantic_parser.parse_semantic_dependencies(input_path,
                                                       output_path)

    predicates = []
    argument_lists = []
    with open(output_path) as f_conll_pred:
        for line in f_conll_pred:
            # Strip the newline first, then any trailing tabs, so that a
            # trailing separator does not yield a spurious empty column.
            line = line.rstrip('\n').rstrip('\t')
            if line == '':
                continue
            fields = line.split('\t')
            predicates.append(fields[10])
            argument_lists.append(fields[11:])
    return predicates, argument_lists

def parse_conll(self, text, language):
sentences = self.split_sentences(text, language)
conll_str = ''
Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@

setup(cmdclass={'build_ext': build_ext},
ext_modules=[Extension("turboparser", ["turbo_parser.pyx"], language="c++",
include_dirs=["../src/parser", "../src/tagger/", "../src/classifier/", "../src/util", "../deps/local/include/"],
include_dirs=["../src/semantic_parser", "../src/parser", "../src/tagger/", "../src/classifier/", "../src/util", "../deps/local/include/"],
library_dirs=[src, "../deps/local/lib/"], libraries=["turboparser", "gflags", "glog", "ad3"])])
29 changes: 29 additions & 0 deletions python/turbo_parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,16 @@ cdef extern from "../libturboparser/TurboParserInterface.h" namespace "TurboPars
void LoadParserModel(string file_model)
void Parse(string file_test, string file_prediction)

cdef cppclass TurboSemanticParserWorker:
TurboSemanticParserWorker()
void LoadSemanticParserModel(string file_model)
void ParseSemanticDependencies(string file_test, string file_prediction)

cdef cppclass TurboParserInterface:
TurboParserInterface()
TurboTaggerWorker* CreateTagger()
TurboParserWorker* CreateParser()
TurboSemanticParserWorker* CreateSemanticParser()


# Wrap them into python extension types.
Expand All @@ -47,6 +53,11 @@ cdef class PTurboParser:
parser.thisptr = self.thisptr.CreateParser()
return parser

def create_semantic_parser(self):
    """Return a wrapper around a semantic parser worker created by the interface.

    The wrapper is built with allocate=False, so it does not delete the
    underlying C++ object when it is deallocated.
    """
    wrapper = PTurboSemanticParserWorker(allocate=False)
    wrapper.thisptr = self.thisptr.CreateSemanticParser()
    return wrapper

cdef class PTurboTaggerWorker:
cdef TurboTaggerWorker *thisptr
cdef bool allocate
Expand Down Expand Up @@ -82,3 +93,21 @@ cdef class PTurboParserWorker:

def parse(self, file_test, file_prediction):
self.thisptr.Parse(file_test, file_prediction)

cdef class PTurboSemanticParserWorker:
    """Python wrapper around a C++ TurboSemanticParserWorker.

    When `allocate` is true the wrapper creates the C++ object itself and
    deletes it on deallocation. Otherwise `thisptr` must be assigned by
    whoever creates the wrapper, and the wrapper never deletes it.
    """
    cdef TurboSemanticParserWorker *thisptr
    cdef bool allocate

    def __cinit__(self, allocate=False):
        self.allocate = allocate
        if not allocate:
            # thisptr is left for the creator of this wrapper to assign.
            return
        self.thisptr = new TurboSemanticParserWorker()

    def __dealloc__(self):
        if not self.allocate:
            # Not ours: the C++ object is deleted elsewhere.
            return
        del self.thisptr

    def load_semantic_parser_model(self, file_model):
        """Load the semantic parser model stored in `file_model`."""
        self.thisptr.LoadSemanticParserModel(file_model)

    def parse_semantic_dependencies(self, file_test, file_prediction):
        """Parse the file `file_test` and write predictions to `file_prediction`."""
        self.thisptr.ParseSemanticDependencies(file_test, file_prediction)
3 changes: 2 additions & 1 deletion scripts_srl/eval08.pl
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,8 @@ sub update_srl_scores
if($gold_prop->position() == $sys_prop->position()){
$counts->{coru_prop} ++;
$counts->{coru_prop_per_tag}{$gold_prop->pposs()} ++;
if($gold_prop->sense() == $sys_prop->sense()){
#if($gold_prop->sense() == $sys_prop->sense()){
if($gold_prop->sense() eq $sys_prop->sense()){
$counts->{corl_prop} ++;
$sent_counts{corl_prop} ++;
$counts->{corl_prop_per_tag}{$gold_prop->pposs()} ++;
Expand Down
Loading

0 comments on commit 999676f

Please sign in to comment.