Skip to content

Commit

Permalink
ENH Created an abstract layer for sequence models, and separated it f…
Browse files Browse the repository at this point in the history
…rom the tagger.
  • Loading branch information
andre-martins committed Sep 24, 2014
1 parent 01b5c7a commit bedee9d
Show file tree
Hide file tree
Showing 39 changed files with 862 additions and 253 deletions.
2 changes: 1 addition & 1 deletion scripts/train_test_parser.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ large_feature_set=true # Use a large feature set (slower but more accurate).
case_sensitive=false # Distinguish word upper/lower case.
form_cutoff=0 # Cutoff in word occurrence.
lemma_cutoff=0 # Cutoff in lemma occurrence.
projective=false # If true, force single-rooted projective trees.
projective=true #false # If true, force single-rooted projective trees.
model_type=standard # Parts used in the model (subset of "af+cs+gp+as+hb+np+dp+gs+ts").
# Some shortcuts are: "standard" (means "af+cs+gp");
# "basic" (means "af"); and "full" (means "af+cs+gp+as+hb+gs+ts").
Expand Down
4 changes: 2 additions & 2 deletions scripts/train_test_tagger.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ language=$1 # Example: "slovene" or "english_proj".
train_algorithm=svm_mira # Training algorithm.
num_epochs=10 # Number of training epochs.
regularization_parameter=1e12 # The C parameter in MIRA.
train=true
train=false #true
test=true
model_type=2 # Second-order model (trigrams).
form_cutoff=1 # Word cutoff. Words that occur this many times or fewer are treated as unknown.
Expand Down Expand Up @@ -75,7 +75,7 @@ then
--file_train=${file_train} \
--train_algorithm=${train_algorithm} \
--train_regularization_constant=${regularization_parameter} \
--tagger_model_type=${model_type} \
--sequence_model_type=${model_type} \
--form_cutoff=${form_cutoff} \
--logtostderr
fi
Expand Down
6 changes: 3 additions & 3 deletions src/parser/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
UTIL = ../util
CLASSIFIER = ../classifier
TAGGER = ../tagger
SEQUENCE = ../sequence

TurboParserprgdir = ../..
TurboParserprg_PROGRAMS = TurboParser
Expand All @@ -14,7 +14,7 @@ DependencyReader.cpp FactorHeadAutomaton.h DependencyDictionary.h \
DependencyInstance.h DependencyPart.cpp DependencyReader.h FactorSequence.h \
DependencyFeatures.cpp DependencyInstanceNumeric.cpp DependencyPart.h \
DependencyWriter.cpp FactorTree.h \
$(TAGGER)/TokenDictionary.cpp $(TAGGER)/TokenDictionary.h \
$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/TokenDictionary.h \
$(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \
$(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \
$(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \
Expand All @@ -29,6 +29,6 @@ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \
$(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \
$(UTIL)/Utils.h

AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(TAGGER) $(CPPFLAGS)
AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) $(CPPFLAGS)
LDADD = $(LFLAGS)

22 changes: 11 additions & 11 deletions src/parser/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
UTIL = ../util
CLASSIFIER = ../classifier
TAGGER = ../tagger
SEQUENCE = ../sequence
TurboParserprgdir = ../..
TurboParser_SOURCES = DependencyDecoder.cpp DependencyFeatures.h \
DependencyInstanceNumeric.h DependencyPipe.cpp DependencyWriter.h \
Expand All @@ -183,7 +183,7 @@ DependencyReader.cpp FactorHeadAutomaton.h DependencyDictionary.h \
DependencyInstance.h DependencyPart.cpp DependencyReader.h FactorSequence.h \
DependencyFeatures.cpp DependencyInstanceNumeric.cpp DependencyPart.h \
DependencyWriter.cpp FactorTree.h \
$(TAGGER)/TokenDictionary.cpp $(TAGGER)/TokenDictionary.h \
$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/TokenDictionary.h \
$(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \
$(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \
$(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \
Expand All @@ -198,7 +198,7 @@ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \
$(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \
$(UTIL)/Utils.h

AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(TAGGER) $(CPPFLAGS)
AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) $(CPPFLAGS)
LDADD = $(LFLAGS)
all: all-am

Expand Down Expand Up @@ -319,19 +319,19 @@ distclean-compile:
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`

TokenDictionary.o: $(TAGGER)/TokenDictionary.cpp
@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.o `test -f '$(TAGGER)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(TAGGER)/TokenDictionary.cpp
TokenDictionary.o: $(SEQUENCE)/TokenDictionary.cpp
@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp
@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(TAGGER)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(TAGGER)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(TAGGER)/TokenDictionary.cpp
@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp

TokenDictionary.obj: $(TAGGER)/TokenDictionary.cpp
@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(TAGGER)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(TAGGER)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(TAGGER)/TokenDictionary.cpp'; fi`
TokenDictionary.obj: $(SEQUENCE)/TokenDictionary.cpp
@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi`
@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(TAGGER)/TokenDictionary.cpp' object='TokenDictionary.obj' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.obj' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(TAGGER)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(TAGGER)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(TAGGER)/TokenDictionary.cpp'; fi`
@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi`

Alphabet.o: $(CLASSIFIER)/Alphabet.cpp
@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT Alphabet.o -MD -MP -MF $(DEPDIR)/Alphabet.Tpo -c -o Alphabet.o `test -f '$(CLASSIFIER)/Alphabet.cpp' || echo '$(srcdir)/'`$(CLASSIFIER)/Alphabet.cpp
Expand Down
6 changes: 3 additions & 3 deletions src/semantic_parser/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
UTIL = ../util
CLASSIFIER = ../classifier
TAGGER = ../tagger
SEQUENCE = ../sequence
PARSER = ../parser

TurboSemanticParserprgdir = ../..
Expand Down Expand Up @@ -30,7 +30,7 @@ $(PARSER)/DependencyInstance.cpp \
$(PARSER)/DependencyInstance.h \
$(PARSER)/DependencyReader.cpp \
$(PARSER)/DependencyReader.h \
$(TAGGER)/TokenDictionary.cpp $(TAGGER)/TokenDictionary.h \
$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/TokenDictionary.h \
$(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \
$(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \
$(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \
Expand All @@ -45,6 +45,6 @@ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \
$(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \
$(UTIL)/Utils.h

AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(TAGGER) -I$(PARSER) $(CPPFLAGS)
AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) -I$(PARSER) $(CPPFLAGS)
LDADD = $(LFLAGS)

22 changes: 11 additions & 11 deletions src/semantic_parser/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
UTIL = ../util
CLASSIFIER = ../classifier
TAGGER = ../tagger
SEQUENCE = ../sequence
PARSER = ../parser
TurboSemanticParserprgdir = ../..
TurboSemanticParser_SOURCES = SemanticDecoder.cpp SemanticFeatures.h \
Expand Down Expand Up @@ -202,7 +202,7 @@ $(PARSER)/DependencyInstance.cpp \
$(PARSER)/DependencyInstance.h \
$(PARSER)/DependencyReader.cpp \
$(PARSER)/DependencyReader.h \
$(TAGGER)/TokenDictionary.cpp $(TAGGER)/TokenDictionary.h \
$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/TokenDictionary.h \
$(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \
$(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \
$(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \
Expand All @@ -217,7 +217,7 @@ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \
$(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \
$(UTIL)/Utils.h

AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(TAGGER) -I$(PARSER) $(CPPFLAGS)
AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) -I$(PARSER) $(CPPFLAGS)
LDADD = $(LFLAGS)
all: all-am

Expand Down Expand Up @@ -413,19 +413,19 @@ DependencyReader.obj: $(PARSER)/DependencyReader.cpp
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o DependencyReader.obj `if test -f '$(PARSER)/DependencyReader.cpp'; then $(CYGPATH_W) '$(PARSER)/DependencyReader.cpp'; else $(CYGPATH_W) '$(srcdir)/$(PARSER)/DependencyReader.cpp'; fi`

TokenDictionary.o: $(TAGGER)/TokenDictionary.cpp
@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.o `test -f '$(TAGGER)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(TAGGER)/TokenDictionary.cpp
TokenDictionary.o: $(SEQUENCE)/TokenDictionary.cpp
@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp
@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(TAGGER)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(TAGGER)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(TAGGER)/TokenDictionary.cpp
@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp

TokenDictionary.obj: $(TAGGER)/TokenDictionary.cpp
@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(TAGGER)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(TAGGER)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(TAGGER)/TokenDictionary.cpp'; fi`
TokenDictionary.obj: $(SEQUENCE)/TokenDictionary.cpp
@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi`
@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(TAGGER)/TokenDictionary.cpp' object='TokenDictionary.obj' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.obj' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(TAGGER)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(TAGGER)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(TAGGER)/TokenDictionary.cpp'; fi`
@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi`

Alphabet.o: $(CLASSIFIER)/Alphabet.cpp
@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT Alphabet.o -MD -MP -MF $(DEPDIR)/Alphabet.Tpo -c -o Alphabet.o `test -f '$(CLASSIFIER)/Alphabet.cpp' || echo '$(srcdir)/'`$(CLASSIFIER)/Alphabet.cpp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -383,8 +383,8 @@ void SequenceDecoder::ConvertToFirstOrderModel(
for (int l = 0; l < node_scores[i+2].size(); ++l, ++t) {
// Tag l at position i+2.
CHECK_LT(t, (*transformed_edge_scores)[i].size());
CHECK_LT(j, (*transformed_edge_scores)[i][t].size());
(*transformed_edge_scores)[i][t][j] =
CHECK_LT(j, (*transformed_edge_scores)[i][t].size());
(*transformed_edge_scores)[i][t][j] =
std::pair<int, double>(s, triplet_scores[i][j][k][l]);
}
}
Expand Down Expand Up @@ -489,7 +489,7 @@ double SequenceDecoder::RunViterbi(const vector<vector<double> > &node_scores,
int num_current_labels = node_scores[i+1].size();
deltas[i + 1].resize(num_current_labels);
backtrack[i + 1].resize(num_current_labels);
for (int k = 0; k < num_current_labels; ++k) {
for (int k = 0; k < num_current_labels; ++k) {
double best_value = -1e-12;
int best = -1;
// Edges from the previous position.
Expand Down
File renamed without changes.
52 changes: 52 additions & 0 deletions src/sequence/SequenceDictionary.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Copyright (c) 2012-2013 Andre Martins
// All Rights Reserved.
//
// This file is part of TurboParser 2.1.
//
// TurboParser 2.1 is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// TurboParser 2.1 is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with TurboParser 2.1. If not, see <http://www.gnu.org/licenses/>.

#include "SequenceDictionary.h"
#include "SequencePipe.h"
#include <algorithm>

// Builds the tag alphabet from the training corpus.
//
// Opens `reader` on the training file path taken from the pipe's options,
// walks every instance it yields, inserts each token's tag into the tag
// alphabet, and finally freezes the alphabet. Every instance returned by
// the reader is deleted here once its tags have been consumed.
void SequenceDictionary::CreateTagDictionary(SequenceReader *reader) {
  LOG(INFO) << "Creating tag dictionary...";

  // Occurrence count per tag, indexed by the id the alphabet handed out.
  // The counts are only accumulated; nothing in this function reads them back.
  vector<int> tag_counts;

  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  for (SequenceInstance *sentence =
           static_cast<SequenceInstance*>(reader->GetNext());
       sentence != NULL;
       sentence = static_cast<SequenceInstance*>(reader->GetNext())) {
    int num_tokens = sentence->size();
    for (int position = 0; position < num_tokens; ++position) {
      // Register the tag in the alphabet and fetch its id.
      int tag_id = tag_alphabet_.Insert(sentence->GetTag(position));

      // An id past the end of the counts must be exactly the next free slot.
      if (tag_id >= tag_counts.size()) {
        CHECK_EQ(tag_id, tag_counts.size());
        tag_counts.push_back(0);
      }
      ++tag_counts[tag_id];
    }
    // Release the instance before the loop header fetches the next one.
    delete sentence;
  }
  reader->Close();
  tag_alphabet_.StopGrowth();

  LOG(INFO) << "Number of tags: " << tag_alphabet_.size();
}
92 changes: 92 additions & 0 deletions src/sequence/SequenceDictionary.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Copyright (c) 2012-2013 Andre Martins
// All Rights Reserved.
//
// This file is part of TurboParser 2.1.
//
// TurboParser 2.1 is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// TurboParser 2.1 is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with TurboParser 2.1. If not, see <http://www.gnu.org/licenses/>.

#ifndef SEQUENCEDICTIONARY_H_
#define SEQUENCEDICTIONARY_H_

#include "Dictionary.h"
#include "TokenDictionary.h"
#include "SerializationUtils.h"

class Pipe;

class SequenceDictionary : public Dictionary {
public:
SequenceDictionary() {}
SequenceDictionary(Pipe* pipe) : pipe_(pipe) {}
virtual ~SequenceDictionary() { Clear(); }

virtual void Clear() {
// Don't clear token_dictionary, since this class does not own it.
tag_alphabet_.clear();
}

virtual void Save(FILE *fs) {
if (0 > tag_alphabet_.Save(fs)) CHECK(false);
}

void Load(FILE *fs) {
if (0 > tag_alphabet_.Load(fs)) CHECK(false);
tag_alphabet_.BuildNames();
}

void AllowGrowth() { token_dictionary_->AllowGrowth(); }
void StopGrowth() { token_dictionary_->StopGrowth(); }

virtual void CreateTagDictionary(SequenceReader *reader);

void BuildTagNames() {
tag_alphabet_.BuildNames();
}

const string &GetTagName(int tag) const {
return tag_alphabet_.GetName(tag);
}

int GetBigramLabel(int left_tag, int tag) {
CHECK_GE(left_tag, -1);
CHECK_GE(tag, -1);
//return (left_tag * tag_alphabet_.size() + tag);
return ((1 + left_tag) * (1 + tag_alphabet_.size()) + (1 + tag));
}

int GetTrigramLabel(int left_left_tag, int left_tag, int tag) {
CHECK_GE(left_left_tag, -1);
CHECK_GE(left_tag, -1);
CHECK_GE(tag, -1);
//return (left_tag * left_tag * tag_alphabet_.size() +
// left_tag * tag_alphabet_.size() + tag);
return ((1 + left_left_tag) * (1 + tag_alphabet_.size()) *
(1 + tag_alphabet_.size()) +
(1 + left_tag) * (1 + tag_alphabet_.size()) + (1 + tag));
}

TokenDictionary *GetTokenDictionary() const { return token_dictionary_; }
void SetTokenDictionary(TokenDictionary *token_dictionary) {
token_dictionary_ = token_dictionary;
}

const Alphabet &GetTagAlphabet() const { return tag_alphabet_; };

protected:
Pipe *pipe_;
TokenDictionary *token_dictionary_;
Alphabet tag_alphabet_;
};

#endif /* SEQUENCEDICTIONARY_H_ */
Loading

0 comments on commit bedee9d

Please sign in to comment.