
Commit

ENH Added configuration file for the Python wrapper; added header with the model version to the parser.
andre-martins committed Sep 22, 2014
1 parent 999676f commit 01b5c7a
Showing 3 changed files with 131 additions and 58 deletions.
49 changes: 49 additions & 0 deletions python/nlp_pipeline.config
@@ -0,0 +1,49 @@
PT
splitter="tokenizers/punkt/portuguese.pickle"
tagger="/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_tagger.model"
parser="/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_parser_pruned-true_model-standard.model"
lemmatizer="/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_lemmatizer.model"

ES
splitter="tokenizers/punkt/spanish.pickle"
tagger="/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_tagger.model"
parser="/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_parser_pruned-true_model-standard.model"
semantic_parser="/home/atm/workspace/CPP/TurboParser/srl/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_semantic_parser_conll2008_pruned-false_model-basic_syntax-true_C-0.01_fp-0.4_fn-0.6.model"
lemmatizer="/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_lemmatizer.model"

EN
splitter="tokenizers/punkt/english.pickle"
tagger="/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_tagger.model"
parser="/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_parser_pruned-true_model-standard.model"

EN-Nonprojective
splitter="tokenizers/punkt/english.pickle"
tagger="/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_tagger.model"
parser="/home/atm/workspace/CPP/TurboParser/models/english/english_parser_pruned-true_model-standard.model"
semantic_parser="/home/atm/workspace/CPP/TurboParser/srl/models/english/english_semantic_parser_conll2008_pruned-false_model-basic_syntax-true_C-0.01_fp-0.4_fn-0.6.model"
lemmatizer="/home/atm/workspace/CPP/TurboParser/models/english/english_lemmatizer.model"

PT-BR-Universal
splitter="tokenizers/punkt/portuguese.pickle"
tagger="/home/atm/workspace/CPP/TurboParser/models/brazilian_portuguese_universal/brazilian_portuguese_universal_tagger.model"
parser="/home/atm/workspace/CPP/TurboParser/models/brazilian_portuguese_universal/brazilian_portuguese_universal_parser_pruned-true_model-standard.model"

ES-Universal
splitter="tokenizers/punkt/spanish.pickle"
tagger="/home/atm/workspace/CPP/TurboParser/models/spanish_universal/spanish_universal_tagger.model"
parser="/home/atm/workspace/CPP/TurboParser/models/spanish_universal/spanish_universal_parser_pruned-true_model-standard.model"

FR-Universal
splitter="tokenizers/punkt/french.pickle"
tagger="/home/atm/workspace/CPP/TurboParser/models/french_universal/french_universal_tagger.model"
parser="/home/atm/workspace/CPP/TurboParser/models/french_universal/french_universal_parser_pruned-true_model-standard.model"

IT-Universal
splitter="tokenizers/punkt/italian.pickle"
tagger="/home/atm/workspace/CPP/TurboParser/models/italian_universal/italian_universal_tagger.model"
parser="/home/atm/workspace/CPP/TurboParser/models/italian_universal/italian_universal_parser_pruned-true_model-standard.model"

DE-Universal
splitter="tokenizers/punkt/german.pickle"
tagger="/home/atm/workspace/CPP/TurboParser/models/german_universal/german_universal_tagger.model"
parser="/home/atm/workspace/CPP/TurboParser/models/german_universal/german_universal_parser_pruned-true_model-standard.model"
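For reference, the loader added to nlp_pipeline.py (next file) turns each block of this config into a per-language dictionary of model paths. A minimal illustrative sketch of what the EN block above is expected to parse into (the literal dict below is not part of the repository):

# Illustrative only: the structure NLPPipeline.load_configuration_file()
# is expected to build for the 'EN' block above.
models = {
    'EN': {
        'splitter': 'tokenizers/punkt/english.pickle',
        'tagger': '/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_tagger.model',
        'parser': '/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_parser_pruned-true_model-standard.model',
    },
}
print(models['EN']['parser'])  # value later passed to parser.load_parser_model()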
116 changes: 59 additions & 57 deletions python/nlp_pipeline.py
@@ -2,82 +2,84 @@
 import tokenizers.portuguese.word_tokenizer as tokenizer_PT
 import lemmatizer
 import turboparser as tp
+import os
+import pdb

 class NLPPipelineWorker:
     def __init__(self, pipeline, language):
-        self.tagger = pipeline.turbo_interface.create_tagger()
-        self.parser = pipeline.turbo_interface.create_parser()
+        self.tagger = None
+        self.parser = None
+        self.semantic_parser = None
+        self.lemmatizer = None
+
+        if language not in pipeline.models:
+            print 'Error: no model for language %s.' % language
+            raise NotImplementedError
+
+        if 'splitter' in pipeline.models[language]:
+            self.sent_tokenizer = nltk.data.load(pipeline.models[language]['splitter'])
+        else:
+            # If no splitter is specified, use the English model.
+            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
+
         if language == 'PT':
-            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
             self.word_tokenizer = tokenizer_PT.PortugueseFlorestaWordTokenizer()
-            self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_tagger.model')
-            self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_parser_pruned-true_model-standard.model')
-            self.lemmatizer = lemmatizer.BasicLemmatizer()
-            self.lemmatizer.load_lemmatizer_model('/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_lemmatizer.model')
-        elif language == 'PT-Cintil':
-            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
-            self.word_tokenizer = tokenizer_PT.PortugueseCintilWordTokenizer()
-            self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_tagger.model')
-            self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_parser_pruned-true_model-standard.model')
-        elif language == 'ES':
-            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
+        else:
             self.word_tokenizer = nltk.TreebankWordTokenizer() # For now...
-            self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_tagger.model')
-            self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_parser_pruned-true_model-standard.model')
-            self.semantic_parser = pipeline.turbo_interface.create_semantic_parser()
-            self.semantic_parser.load_semantic_parser_model('/home/atm/workspace/CPP/TurboParser/srl/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_semantic_parser_conll2008_pruned-false_model-basic_syntax-true_C-0.01_fp-0.4_fn-0.6.model')
+
+        if 'tagger' in pipeline.models[language]:
+            self.tagger = pipeline.turbo_interface.create_tagger()
+            self.tagger.load_tagger_model(pipeline.models[language]['tagger'])
+        if 'parser' in pipeline.models[language]:
+            self.parser = pipeline.turbo_interface.create_parser()
+            self.parser.load_parser_model(pipeline.models[language]['parser'])
+        if 'lemmatizer' in pipeline.models[language]:
             self.lemmatizer = lemmatizer.BasicLemmatizer()
-            self.lemmatizer.load_lemmatizer_model('/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_lemmatizer.model')
-        elif language == 'EN':
-            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
-            self.word_tokenizer = nltk.TreebankWordTokenizer()
-            self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_tagger.model')
-            self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_parser_pruned-true_model-standard.model')
-        elif language == 'EN-Nonprojective':
-            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
-            self.word_tokenizer = nltk.TreebankWordTokenizer()
-            self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_tagger.model')
-            self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/english/english_parser_pruned-true_model-standard.model')
+            self.lemmatizer.load_lemmatizer_model(pipeline.models[language]['lemmatizer'])
+        if 'semantic_parser' in pipeline.models[language]:
             self.semantic_parser = pipeline.turbo_interface.create_semantic_parser()
-            self.semantic_parser.load_semantic_parser_model('/home/atm/workspace/CPP/TurboParser/srl/models/english/english_semantic_parser_conll2008_pruned-false_model-basic_syntax-true_C-0.01_fp-0.4_fn-0.6.model')
-            self.lemmatizer = lemmatizer.BasicLemmatizer()
-            self.lemmatizer.load_lemmatizer_model('/home/atm/workspace/CPP/TurboParser/models/english/english_lemmatizer.model')
-        elif language == 'PT-BR-Universal':
-            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
-            self.word_tokenizer = nltk.TreebankWordTokenizer() # For now...
-            self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/brazilian_portuguese_universal/brazilian_portuguese_universal_tagger.model')
-            self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/brazilian_portuguese_universal/brazilian_portuguese_universal_parser_pruned-true_model-standard.model')
-        elif language == 'ES-Universal':
-            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
-            self.word_tokenizer = nltk.TreebankWordTokenizer() # For now...
-            self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/spanish_universal/spanish_universal_tagger.model')
-            self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/spanish_universal/spanish_universal_parser_pruned-true_model-standard.model')
-        elif language == 'FR-Universal':
-            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
-            self.word_tokenizer = nltk.TreebankWordTokenizer() # For now...
-            self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/french_universal/french_universal_tagger.model')
-            self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/french_universal/french_universal_parser_pruned-true_model-standard.model')
-        elif language == 'IT-Universal':
-            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')
-            self.word_tokenizer = nltk.TreebankWordTokenizer() # For now...
-            self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/italian_universal/italian_universal_tagger.model')
-            self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/italian_universal/italian_universal_parser_pruned-true_model-standard.model')
-        elif language == 'DE-Universal':
-            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
-            self.word_tokenizer = nltk.TreebankWordTokenizer() # For now...
-            self.tagger.load_tagger_model('/home/atm/workspace/CPP/TurboParser/models/german_universal/german_universal_tagger.model')
-            self.parser.load_parser_model('/home/atm/workspace/CPP/TurboParser/models/german_universal/german_universal_parser_pruned-true_model-standard.model')
-        else:
-            raise NotImplementedError
+            self.semantic_parser.load_semantic_parser_model(pipeline.models[language]['semantic_parser'])


 class NLPPipeline:
     def __init__(self):
+        # Load the initialization file.
+        configuration_filepath = os.path.dirname(os.path.realpath(__file__)) + \
+            os.sep + 'nlp_pipeline.config'
+        self.models = {}
+        self.load_configuration_file(configuration_filepath)
         self.turbo_interface = tp.PTurboParser()
         self.workers = {}

+    def load_configuration_file(self, filepath):
+        f = open(filepath)
+        language = ''
+        for line in f:
+            line = line.rstrip('\r\n')
+            if line == '':
+                language = ''
+                continue
+            # Ignore comments.
+            index = line.find('#')
+            if index >= 0:
+                line = line[:index]
+            line = line.strip()
+            if line == '':
+                continue
+            if language == '':
+                language = line
+                print 'Loading information for %s' % language
+                self.models[language] = {}
+            else:
+                pair = line.split('=')
+                assert len(pair) == 2, pdb.set_trace()
+                name = pair[0]
+                value = pair[1].strip('"')
+                self.models[language][name] = value
+        f.close()

     def get_worker(self, language):
         if language in self.workers:
             return self.workers[language]
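A minimal usage sketch of the config-driven wrapper (hypothetical call sequence; it assumes the truncated remainder of get_worker constructs an NLPPipelineWorker on first request and that the model files listed in nlp_pipeline.config exist locally):

# Hypothetical usage; 'EN' must match a block name in nlp_pipeline.config.
pipeline = NLPPipeline()             # parses nlp_pipeline.config next to the module
worker = pipeline.get_worker('EN')   # loads the splitter/tagger/parser listed under EN
print(worker.lemmatizer)             # None here: the EN block declares no lemmatizer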
24 changes: 23 additions & 1 deletion src/parser/DependencyPipe.cpp
@@ -25,15 +25,37 @@

 using namespace std;

+// Define the current model version and the oldest back-compatible version.
+// The format is AAAA.BBBB.CCCC, e.g., 2 0003 0000 means "2.3.0".
+const uint64_t kParserModelVersion = 200030000;
+const uint64_t kOldestCompatibleParserModelVersion = 200030000;
+const uint64_t kParserModelCheck = 1234567890;
+
 void DependencyPipe::SaveModel(FILE* fs) {
+  bool success;
+  success = WriteUINT64(fs, kParserModelCheck);
+  CHECK(success);
+  success = WriteUINT64(fs, kParserModelVersion);
+  CHECK(success);
   token_dictionary_->Save(fs);
   Pipe::SaveModel(fs);
   pruner_parameters_->Save(fs);
 }

 void DependencyPipe::LoadModel(FILE* fs) {
+  bool success;
+  uint64_t model_check;
+  uint64_t model_version;
+  success = ReadUINT64(fs, &model_check);
+  CHECK(success);
+  CHECK_EQ(model_check, kParserModelCheck)
+      << "The model file is too old and not supported anymore.";
+  success = ReadUINT64(fs, &model_version);
+  CHECK(success);
+  CHECK_GE(model_version, kOldestCompatibleParserModelVersion)
+      << "The model file is too old and not supported anymore.";
   delete token_dictionary_;
   CreateTokenDictionary();
   static_cast<DependencyDictionary*>(dictionary_)->
       SetTokenDictionary(token_dictionary_);
   token_dictionary_->Load(fs);
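The version constant packs major, minor, and patch numbers into decimal fields (AAAA BBBB CCCC), so 2 0003 0000 reads as version 2.3.0. A small illustrative sketch of that packing in Python (these helpers are not part of TurboParser):

# Illustration of the AAAA.BBBB.CCCC packing behind kParserModelVersion.
def pack_version(major, minor, patch):
    return major * 10**8 + minor * 10**4 + patch

def unpack_version(v):
    return (v // 10**8, (v // 10**4) % 10**4, v % 10**4)

assert pack_version(2, 3, 0) == 200030000      # kParserModelVersion
assert unpack_version(200030000) == (2, 3, 0)  # i.e., "2.3.0"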
