ENH Created an abstract layer for sequence models, and separated it from the tagger.
Cocophotos · Sep 24, 2014 · bedee9d · bedee9d
1 parent 01b5c7a
commit bedee9d
Show file tree

Hide file tree

Showing 39 changed files with 862 additions and 253 deletions.
diff --git a/scripts/train_test_parser.sh b/scripts/train_test_parser.sh
@@ -23,7 +23,7 @@ large_feature_set=true # Use a large feature set (slower but more accurate).
 case_sensitive=false # Distinguish word upper/lower case.
 form_cutoff=0 # Cutoff in word occurrence.
 lemma_cutoff=0 # Cutoff in lemma occurrence.
-projective=false # If true, force single-rooted projective trees.
+projective=true #false # If true, force single-rooted projective trees.
 model_type=standard # Parts used in the model (subset of "af+cs+gp+as+hb+np+dp+gs+ts").
                     # Some shortcuts are: "standard" (means "af+cs+gp");
                     # "basic" (means "af"); and "full" (means "af+cs+gp+as+hb+gs+ts").

diff --git a/scripts/train_test_tagger.sh b/scripts/train_test_tagger.sh
@@ -9,7 +9,7 @@ language=$1 # Example: "slovene" or "english_proj".
 train_algorithm=svm_mira # Training algorithm.
 num_epochs=10 # Number of training epochs.
 regularization_parameter=1e12 # The C parameter in MIRA.
-train=true
+train=false #true
 test=true
 model_type=2 # Second-order model (trigrams).
 form_cutoff=1 # Word cutoff. Only words which occur more than these times won't be considered unknown.
@@ -75,7 +75,7 @@ then
         --file_train=${file_train} \
         --train_algorithm=${train_algorithm} \
         --train_regularization_constant=${regularization_parameter} \
-        --tagger_model_type=${model_type} \
+        --sequence_model_type=${model_type} \
         --form_cutoff=${form_cutoff} \
         --logtostderr
 fi

diff --git a/src/parser/Makefile.am b/src/parser/Makefile.am
@@ -1,6 +1,6 @@
 UTIL = ../util
 CLASSIFIER = ../classifier
-TAGGER = ../tagger
+SEQUENCE = ../sequence
 
 TurboParserprgdir = ../..
 TurboParserprg_PROGRAMS = TurboParser
@@ -14,7 +14,7 @@ DependencyReader.cpp FactorHeadAutomaton.h DependencyDictionary.h \
 DependencyInstance.h DependencyPart.cpp DependencyReader.h FactorSequence.h \
 DependencyFeatures.cpp DependencyInstanceNumeric.cpp DependencyPart.h \
 DependencyWriter.cpp FactorTree.h \
-$(TAGGER)/TokenDictionary.cpp $(TAGGER)/TokenDictionary.h \
+$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/TokenDictionary.h \
 $(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \
 $(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \
 $(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \
@@ -29,6 +29,6 @@ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \
 $(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \
 $(UTIL)/Utils.h
 
-AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(TAGGER) $(CPPFLAGS)
+AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) $(CPPFLAGS)
 LDADD = $(LFLAGS)
 
diff --git a/src/parser/Makefile.in b/src/parser/Makefile.in
@@ -171,7 +171,7 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 UTIL = ../util
 CLASSIFIER = ../classifier
-TAGGER = ../tagger
+SEQUENCE = ../sequence
 TurboParserprgdir = ../..
 TurboParser_SOURCES = DependencyDecoder.cpp DependencyFeatures.h \
 DependencyInstanceNumeric.h DependencyPipe.cpp DependencyWriter.h \
@@ -183,7 +183,7 @@ DependencyReader.cpp FactorHeadAutomaton.h DependencyDictionary.h \
 DependencyInstance.h DependencyPart.cpp DependencyReader.h FactorSequence.h \
 DependencyFeatures.cpp DependencyInstanceNumeric.cpp DependencyPart.h \
 DependencyWriter.cpp FactorTree.h \
-$(TAGGER)/TokenDictionary.cpp $(TAGGER)/TokenDictionary.h \
+$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/TokenDictionary.h \
 $(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \
 $(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \
 $(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \
@@ -198,7 +198,7 @@ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \
 $(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \
 $(UTIL)/Utils.h
 
-AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(TAGGER) $(CPPFLAGS)
+AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) $(CPPFLAGS)
 LDADD = $(LFLAGS)
 all: all-am
 
@@ -319,19 +319,19 @@ distclean-compile:
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
 
-TokenDictionary.o: $(TAGGER)/TokenDictionary.cpp
-@am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.o `test -f '$(TAGGER)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(TAGGER)/TokenDictionary.cpp
+TokenDictionary.o: $(SEQUENCE)/TokenDictionary.cpp
+@am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp
 @am__fastdepCXX_TRUE@	$(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='$(TAGGER)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(TAGGER)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(TAGGER)/TokenDictionary.cpp
+@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp
 
-TokenDictionary.obj: $(TAGGER)/TokenDictionary.cpp
-@am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(TAGGER)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(TAGGER)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(TAGGER)/TokenDictionary.cpp'; fi`
+TokenDictionary.obj: $(SEQUENCE)/TokenDictionary.cpp
+@am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi`
 @am__fastdepCXX_TRUE@	$(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='$(TAGGER)/TokenDictionary.cpp' object='TokenDictionary.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.obj' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(TAGGER)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(TAGGER)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(TAGGER)/TokenDictionary.cpp'; fi`
+@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi`
 
 Alphabet.o: $(CLASSIFIER)/Alphabet.cpp
 @am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT Alphabet.o -MD -MP -MF $(DEPDIR)/Alphabet.Tpo -c -o Alphabet.o `test -f '$(CLASSIFIER)/Alphabet.cpp' || echo '$(srcdir)/'`$(CLASSIFIER)/Alphabet.cpp

diff --git a/src/semantic_parser/Makefile.am b/src/semantic_parser/Makefile.am
@@ -1,6 +1,6 @@
 UTIL = ../util
 CLASSIFIER = ../classifier
-TAGGER = ../tagger
+SEQUENCE = ../sequence
 PARSER = ../parser
 
 TurboSemanticParserprgdir = ../..
@@ -30,7 +30,7 @@ $(PARSER)/DependencyInstance.cpp \
 $(PARSER)/DependencyInstance.h \
 $(PARSER)/DependencyReader.cpp \
 $(PARSER)/DependencyReader.h \
-$(TAGGER)/TokenDictionary.cpp $(TAGGER)/TokenDictionary.h \
+$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/TokenDictionary.h \
 $(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \
 $(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \
 $(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \
@@ -45,6 +45,6 @@ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \
 $(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \
 $(UTIL)/Utils.h
 
-AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(TAGGER) -I$(PARSER) $(CPPFLAGS)
+AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) -I$(PARSER) $(CPPFLAGS)
 LDADD = $(LFLAGS)
 
diff --git a/src/semantic_parser/Makefile.in b/src/semantic_parser/Makefile.in
@@ -174,7 +174,7 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 UTIL = ../util
 CLASSIFIER = ../classifier
-TAGGER = ../tagger
+SEQUENCE = ../sequence
 PARSER = ../parser
 TurboSemanticParserprgdir = ../..
 TurboSemanticParser_SOURCES = SemanticDecoder.cpp SemanticFeatures.h \
@@ -202,7 +202,7 @@ $(PARSER)/DependencyInstance.cpp \
 $(PARSER)/DependencyInstance.h \
 $(PARSER)/DependencyReader.cpp \
 $(PARSER)/DependencyReader.h \
-$(TAGGER)/TokenDictionary.cpp $(TAGGER)/TokenDictionary.h \
+$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/TokenDictionary.h \
 $(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \
 $(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \
 $(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \
@@ -217,7 +217,7 @@ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \
 $(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \
 $(UTIL)/Utils.h
 
-AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(TAGGER) -I$(PARSER) $(CPPFLAGS)
+AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) -I$(PARSER) $(CPPFLAGS)
 LDADD = $(LFLAGS)
 all: all-am
 
@@ -413,19 +413,19 @@ DependencyReader.obj: $(PARSER)/DependencyReader.cpp
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o DependencyReader.obj `if test -f '$(PARSER)/DependencyReader.cpp'; then $(CYGPATH_W) '$(PARSER)/DependencyReader.cpp'; else $(CYGPATH_W) '$(srcdir)/$(PARSER)/DependencyReader.cpp'; fi`
 
-TokenDictionary.o: $(TAGGER)/TokenDictionary.cpp
-@am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.o `test -f '$(TAGGER)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(TAGGER)/TokenDictionary.cpp
+TokenDictionary.o: $(SEQUENCE)/TokenDictionary.cpp
+@am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp
 @am__fastdepCXX_TRUE@	$(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='$(TAGGER)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(TAGGER)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(TAGGER)/TokenDictionary.cpp
+@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp
 
-TokenDictionary.obj: $(TAGGER)/TokenDictionary.cpp
-@am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(TAGGER)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(TAGGER)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(TAGGER)/TokenDictionary.cpp'; fi`
+TokenDictionary.obj: $(SEQUENCE)/TokenDictionary.cpp
+@am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi`
 @am__fastdepCXX_TRUE@	$(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='$(TAGGER)/TokenDictionary.cpp' object='TokenDictionary.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.obj' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(TAGGER)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(TAGGER)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(TAGGER)/TokenDictionary.cpp'; fi`
+@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi`
 
 Alphabet.o: $(CLASSIFIER)/Alphabet.cpp
 @am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT Alphabet.o -MD -MP -MF $(DEPDIR)/Alphabet.Tpo -c -o Alphabet.o `test -f '$(CLASSIFIER)/Alphabet.cpp' || echo '$(srcdir)/'`$(CLASSIFIER)/Alphabet.cpp

diff --git a/src/tagger/SequenceDecoder.cpp → src/sequence/SequenceDecoder.cpp b/src/tagger/SequenceDecoder.cpp → src/sequence/SequenceDecoder.cpp
@@ -383,8 +383,8 @@ void SequenceDecoder::ConvertToFirstOrderModel(
         for (int l = 0; l < node_scores[i+2].size(); ++l, ++t) {
           // Tag l at position i+2.
           CHECK_LT(t, (*transformed_edge_scores)[i].size());
-          CHECK_LT(j, (*transformed_edge_scores)[i][t].size());          
-          (*transformed_edge_scores)[i][t][j] = 
+          CHECK_LT(j, (*transformed_edge_scores)[i][t].size());
+          (*transformed_edge_scores)[i][t][j] =
             std::pair<int, double>(s, triplet_scores[i][j][k][l]);
         }
       }
@@ -489,7 +489,7 @@ double SequenceDecoder::RunViterbi(const vector<vector<double> > &node_scores,
     int num_current_labels = node_scores[i+1].size();
     deltas[i + 1].resize(num_current_labels);
     backtrack[i + 1].resize(num_current_labels);
-	for (int k = 0; k < num_current_labels; ++k) {
+    for (int k = 0; k < num_current_labels; ++k) {
       double best_value = -1e-12;
       int best = -1;
       // Edges from the previous position.

diff --git a/src/tagger/SequenceDecoder.h → src/sequence/SequenceDecoder.h b/src/tagger/SequenceDecoder.h → src/sequence/SequenceDecoder.h
diff --git a/src/sequence/SequenceDictionary.cpp b/src/sequence/SequenceDictionary.cpp
@@ -0,0 +1,61 @@
+// Copyright (c) 2012-2013 Andre Martins
+// All Rights Reserved.
+//
+// This file is part of TurboParser 2.1.
+//
+// TurboParser 2.1 is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// TurboParser 2.1 is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with TurboParser 2.1.  If not, see <http://www.gnu.org/licenses/>.
+
+#include "SequenceDictionary.h"
+#include "SequencePipe.h"
+#include <algorithm>
+
+// Builds the tag alphabet from the training corpus.
+//
+// Opens the training file named by the pipe's options, inserts the tag of
+// every token of every instance into tag_alphabet_, then closes the reader
+// and calls StopGrowth() on the alphabet.
+// NOTE(review): instances returned by reader->GetNext() are deleted here, so
+// the reader is assumed to hand over ownership -- confirm.
+void SequenceDictionary::CreateTagDictionary(SequenceReader *reader) {
+  LOG(INFO) << "Creating tag dictionary...";
+  // Occurrence counts indexed by tag id. They are not reported anywhere in
+  // this function; they only back the contiguous-id check below.
+  vector<int> tag_freqs;
+
+  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
+  SequenceInstance *instance =
+    static_cast<SequenceInstance*>(reader->GetNext());
+  while (instance != NULL) {
+    const int instance_length = instance->size();
+    for (int i = 0; i < instance_length; ++i) {
+      // Add tag to alphabet.
+      const int id = tag_alphabet_.Insert(instance->GetTag(i));
+      // A negative id can never index tag_freqs; fail before the cast below.
+      CHECK_GE(id, 0);
+      const size_t index = static_cast<size_t>(id);
+      if (index >= tag_freqs.size()) {
+        // A tag seen for the first time must receive the next unused id.
+        CHECK_EQ(index, tag_freqs.size());
+        tag_freqs.push_back(0);
+      }
+      ++tag_freqs[index];
+    }
+    delete instance;
+    instance = static_cast<SequenceInstance*>(reader->GetNext());
+  }
+  reader->Close();
+  tag_alphabet_.StopGrowth();
+
+  LOG(INFO) << "Number of tags: " << tag_alphabet_.size();
+}
diff --git a/src/sequence/SequenceDictionary.h b/src/sequence/SequenceDictionary.h
@@ -0,0 +1,115 @@
+// Copyright (c) 2012-2013 Andre Martins
+// All Rights Reserved.
+//
+// This file is part of TurboParser 2.1.
+//
+// TurboParser 2.1 is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// TurboParser 2.1 is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with TurboParser 2.1.  If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef SEQUENCEDICTIONARY_H_
+#define SEQUENCEDICTIONARY_H_
+
+#include "Dictionary.h"
+#include "TokenDictionary.h"
+#include "SerializationUtils.h"
+
+class Pipe;
+// Only used as a pointer type here, so a forward declaration suffices.
+class SequenceReader;
+
+// Dictionary for sequence models. It owns the tag alphabet and refers to a
+// TokenDictionary that it does not own.
+class SequenceDictionary : public Dictionary {
+ public:
+  // Pointers that are not supplied start out NULL rather than indeterminate.
+  SequenceDictionary() : pipe_(NULL), token_dictionary_(NULL) {}
+  SequenceDictionary(Pipe* pipe) : pipe_(pipe), token_dictionary_(NULL) {}
+  // Inside a destructor this always runs SequenceDictionary::Clear(), never an
+  // override from a derived class.
+  virtual ~SequenceDictionary() { Clear(); }
+
+  virtual void Clear() {
+    // Don't clear token_dictionary, since this class does not own it.
+    tag_alphabet_.clear();
+  }
+
+  // Writes the tag alphabet to fs; a negative status from the alphabet is fatal.
+  virtual void Save(FILE *fs) {
+    if (0 > tag_alphabet_.Save(fs)) CHECK(false);
+  }
+
+  // Reads the tag alphabet from fs (a negative status is fatal) and then calls
+  // BuildNames() on it -- presumably to restore the table GetTagName() reads;
+  // confirm against Alphabet.
+  void Load(FILE *fs) {
+    if (0 > tag_alphabet_.Load(fs)) CHECK(false);
+    tag_alphabet_.BuildNames();
+  }
+
+  // Growth control is forwarded to the token dictionary, which must have been
+  // installed with SetTokenDictionary() beforehand.
+  void AllowGrowth() {
+    CHECK(token_dictionary_ != NULL);
+    token_dictionary_->AllowGrowth();
+  }
+  void StopGrowth() {
+    CHECK(token_dictionary_ != NULL);
+    token_dictionary_->StopGrowth();
+  }
+
+  // Fills the tag alphabet using the given reader (out-of-line definition).
+  virtual void CreateTagDictionary(SequenceReader *reader);
+
+  void BuildTagNames() {
+    tag_alphabet_.BuildNames();
+  }
+
+  // Returns the name the tag alphabet holds for the given tag id.
+  const string &GetTagName(int tag) const {
+    return tag_alphabet_.GetName(tag);
+  }
+
+  // Packs (left_tag, tag) into one int. Every tag is shifted up by one so that
+  // -1 (the smallest accepted value) maps to 0, and the pair is read as a
+  // two-digit number in base (1 + number of tags).
+  int GetBigramLabel(int left_tag, int tag) {
+    CHECK_GE(left_tag, -1);
+    CHECK_GE(tag, -1);
+    return ((1 + left_tag) * (1 + tag_alphabet_.size()) +  (1 + tag));
+  }
+
+  // Same encoding as GetBigramLabel, with three digits.
+  int GetTrigramLabel(int left_left_tag, int left_tag, int tag) {
+    CHECK_GE(left_left_tag, -1);
+    CHECK_GE(left_tag, -1);
+    CHECK_GE(tag, -1);
+    return ((1 + left_left_tag) * (1 + tag_alphabet_.size()) *
+            (1 + tag_alphabet_.size()) +
+            (1 + left_tag) * (1 + tag_alphabet_.size()) + (1 + tag));
+  }
+
+  TokenDictionary *GetTokenDictionary() const { return token_dictionary_; }
+  void SetTokenDictionary(TokenDictionary *token_dictionary) {
+    token_dictionary_ = token_dictionary;
+  }
+
+  const Alphabet &GetTagAlphabet() const { return tag_alphabet_; }
+
+ protected:
+  Pipe *pipe_;
+  TokenDictionary *token_dictionary_;  // Not owned.
+  Alphabet tag_alphabet_;
+};
+
+#endif /* SEQUENCEDICTIONARY_H_ */