From bedee9d300e50542f3ae6aa71d0ba8e48e857bb3 Mon Sep 17 00:00:00 2001 From: Andre Martins Date: Wed, 24 Sep 2014 16:49:52 +0100 Subject: [PATCH] ENH Created an abstract layer for sequence models, and separated it from the tagger. --- scripts/train_test_parser.sh | 2 +- scripts/train_test_tagger.sh | 4 +- src/parser/Makefile.am | 6 +- src/parser/Makefile.in | 22 +- src/semantic_parser/Makefile.am | 6 +- src/semantic_parser/Makefile.in | 22 +- src/{tagger => sequence}/SequenceDecoder.cpp | 6 +- src/{tagger => sequence}/SequenceDecoder.h | 0 src/sequence/SequenceDictionary.cpp | 52 +++++ src/sequence/SequenceDictionary.h | 92 +++++++++ src/{tagger => sequence}/SequenceFeatures.h | 32 +-- src/{tagger => sequence}/SequenceInstance.cpp | 0 src/{tagger => sequence}/SequenceInstance.h | 0 .../SequenceInstanceNumeric.cpp | 0 .../SequenceInstanceNumeric.h | 0 src/sequence/SequenceOptions.cpp | 81 ++++++++ src/{tagger => sequence}/SequenceOptions.h | 15 +- src/{tagger => sequence}/SequencePart.cpp | 0 src/{tagger => sequence}/SequencePart.h | 0 src/{tagger => sequence}/SequencePipe.cpp | 15 +- src/{tagger => sequence}/SequencePipe.h | 24 ++- src/{tagger => sequence}/SequenceReader.cpp | 0 src/{tagger => sequence}/SequenceReader.h | 0 src/{tagger => sequence}/SequenceWriter.cpp | 0 src/{tagger => sequence}/SequenceWriter.h | 0 src/{tagger => sequence}/TokenDictionary.cpp | 0 src/{tagger => sequence}/TokenDictionary.h | 0 src/tagger/Makefile.am | 28 ++- src/tagger/Makefile.in | 190 ++++++++++++++++-- ...nceDictionary.cpp => TaggerDictionary.cpp} | 51 ++--- ...equenceDictionary.h => TaggerDictionary.h} | 68 +------ ...reTemplates.h => TaggerFeatureTemplates.h} | 16 +- ...equenceFeatures.cpp => TaggerFeatures.cpp} | 52 ++--- src/tagger/TaggerFeatures.h | 49 +++++ ...{SequenceOptions.cpp => TaggerOptions.cpp} | 30 +-- src/tagger/TaggerOptions.h | 50 +++++ src/tagger/TaggerPipe.cpp | 28 +++ src/tagger/TaggerPipe.h | 164 +++++++++++++++ src/tagger/TurboTagger.cpp | 10 +- 39 files 
changed, 862 insertions(+), 253 deletions(-) rename src/{tagger => sequence}/SequenceDecoder.cpp (99%) rename src/{tagger => sequence}/SequenceDecoder.h (100%) create mode 100644 src/sequence/SequenceDictionary.cpp create mode 100644 src/sequence/SequenceDictionary.h rename src/{tagger => sequence}/SequenceFeatures.h (79%) rename src/{tagger => sequence}/SequenceInstance.cpp (100%) rename src/{tagger => sequence}/SequenceInstance.h (100%) rename src/{tagger => sequence}/SequenceInstanceNumeric.cpp (100%) rename src/{tagger => sequence}/SequenceInstanceNumeric.h (100%) create mode 100644 src/sequence/SequenceOptions.cpp rename src/{tagger => sequence}/SequenceOptions.h (76%) rename src/{tagger => sequence}/SequencePart.cpp (100%) rename src/{tagger => sequence}/SequencePart.h (100%) rename src/{tagger => sequence}/SequencePipe.cpp (98%) rename src/{tagger => sequence}/SequencePipe.h (88%) rename src/{tagger => sequence}/SequenceReader.cpp (100%) rename src/{tagger => sequence}/SequenceReader.h (100%) rename src/{tagger => sequence}/SequenceWriter.cpp (100%) rename src/{tagger => sequence}/SequenceWriter.h (100%) rename src/{tagger => sequence}/TokenDictionary.cpp (100%) rename src/{tagger => sequence}/TokenDictionary.h (100%) rename src/tagger/{SequenceDictionary.cpp => TaggerDictionary.cpp} (71%) rename src/tagger/{SequenceDictionary.h => TaggerDictionary.h} (60%) rename src/tagger/{SequenceFeatureTemplates.h => TaggerFeatureTemplates.h} (82%) rename src/tagger/{SequenceFeatures.cpp => TaggerFeatures.cpp} (66%) create mode 100644 src/tagger/TaggerFeatures.h rename src/tagger/{SequenceOptions.cpp => TaggerOptions.cpp} (81%) create mode 100644 src/tagger/TaggerOptions.h create mode 100644 src/tagger/TaggerPipe.cpp create mode 100644 src/tagger/TaggerPipe.h diff --git a/scripts/train_test_parser.sh b/scripts/train_test_parser.sh index d567421..7f81851 100755 --- a/scripts/train_test_parser.sh +++ b/scripts/train_test_parser.sh @@ -23,7 +23,7 @@ large_feature_set=true 
# Use a large feature set (slower but more accurate). case_sensitive=false # Distinguish word upper/lower case. form_cutoff=0 # Cutoff in word occurrence. lemma_cutoff=0 # Cutoff in lemma occurrence. -projective=false # If true, force single-rooted projective trees. +projective=true #false # If true, force single-rooted projective trees. model_type=standard # Parts used in the model (subset of "af+cs+gp+as+hb+np+dp+gs+ts"). # Some shortcuts are: "standard" (means "af+cs+gp"); # "basic" (means "af"); and "full" (means "af+cs+gp+as+hb+gs+ts"). diff --git a/scripts/train_test_tagger.sh b/scripts/train_test_tagger.sh index 510a64a..de5e8a6 100755 --- a/scripts/train_test_tagger.sh +++ b/scripts/train_test_tagger.sh @@ -9,7 +9,7 @@ language=$1 # Example: "slovene" or "english_proj". train_algorithm=svm_mira # Training algorithm. num_epochs=10 # Number of training epochs. regularization_parameter=1e12 # The C parameter in MIRA. -train=true +train=false #true test=true model_type=2 # Second-order model (trigrams). form_cutoff=1 # Word cutoff. Only words which occur more than these times won't be considered unknown. @@ -75,7 +75,7 @@ then --file_train=${file_train} \ --train_algorithm=${train_algorithm} \ --train_regularization_constant=${regularization_parameter} \ - --tagger_model_type=${model_type} \ + --sequence_model_type=${model_type} \ --form_cutoff=${form_cutoff} \ --logtostderr fi diff --git a/src/parser/Makefile.am b/src/parser/Makefile.am index 26e3534..436b3d6 100644 --- a/src/parser/Makefile.am +++ b/src/parser/Makefile.am @@ -1,6 +1,6 @@ UTIL = ../util CLASSIFIER = ../classifier -TAGGER = ../tagger +SEQUENCE = ../sequence TurboParserprgdir = ../.. 
TurboParserprg_PROGRAMS = TurboParser @@ -14,7 +14,7 @@ DependencyReader.cpp FactorHeadAutomaton.h DependencyDictionary.h \ DependencyInstance.h DependencyPart.cpp DependencyReader.h FactorSequence.h \ DependencyFeatures.cpp DependencyInstanceNumeric.cpp DependencyPart.h \ DependencyWriter.cpp FactorTree.h \ -$(TAGGER)/TokenDictionary.cpp $(TAGGER)/TokenDictionary.h \ +$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/TokenDictionary.h \ $(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \ $(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \ $(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \ @@ -29,6 +29,6 @@ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \ $(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \ $(UTIL)/Utils.h -AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(TAGGER) $(CPPFLAGS) +AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) $(CPPFLAGS) LDADD = $(LFLAGS) diff --git a/src/parser/Makefile.in b/src/parser/Makefile.in index a64914c..5aeacaf 100644 --- a/src/parser/Makefile.in +++ b/src/parser/Makefile.in @@ -171,7 +171,7 @@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ UTIL = ../util CLASSIFIER = ../classifier -TAGGER = ../tagger +SEQUENCE = ../sequence TurboParserprgdir = ../.. 
TurboParser_SOURCES = DependencyDecoder.cpp DependencyFeatures.h \ DependencyInstanceNumeric.h DependencyPipe.cpp DependencyWriter.h \ @@ -183,7 +183,7 @@ DependencyReader.cpp FactorHeadAutomaton.h DependencyDictionary.h \ DependencyInstance.h DependencyPart.cpp DependencyReader.h FactorSequence.h \ DependencyFeatures.cpp DependencyInstanceNumeric.cpp DependencyPart.h \ DependencyWriter.cpp FactorTree.h \ -$(TAGGER)/TokenDictionary.cpp $(TAGGER)/TokenDictionary.h \ +$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/TokenDictionary.h \ $(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \ $(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \ $(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \ @@ -198,7 +198,7 @@ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \ $(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \ $(UTIL)/Utils.h -AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(TAGGER) $(CPPFLAGS) +AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) $(CPPFLAGS) LDADD = $(LFLAGS) all: all-am @@ -319,19 +319,19 @@ distclean-compile: @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` -TokenDictionary.o: $(TAGGER)/TokenDictionary.cpp -@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.o `test -f '$(TAGGER)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(TAGGER)/TokenDictionary.cpp +TokenDictionary.o: $(SEQUENCE)/TokenDictionary.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp 
@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(TAGGER)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(TAGGER)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(TAGGER)/TokenDictionary.cpp +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp -TokenDictionary.obj: $(TAGGER)/TokenDictionary.cpp -@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(TAGGER)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(TAGGER)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(TAGGER)/TokenDictionary.cpp'; fi` +TokenDictionary.obj: $(SEQUENCE)/TokenDictionary.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi` @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(TAGGER)/TokenDictionary.cpp' object='TokenDictionary.obj' 
libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.obj' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(TAGGER)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(TAGGER)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(TAGGER)/TokenDictionary.cpp'; fi` +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi` Alphabet.o: $(CLASSIFIER)/Alphabet.cpp @am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT Alphabet.o -MD -MP -MF $(DEPDIR)/Alphabet.Tpo -c -o Alphabet.o `test -f '$(CLASSIFIER)/Alphabet.cpp' || echo '$(srcdir)/'`$(CLASSIFIER)/Alphabet.cpp diff --git a/src/semantic_parser/Makefile.am b/src/semantic_parser/Makefile.am index 113f36b..165b410 100644 --- a/src/semantic_parser/Makefile.am +++ b/src/semantic_parser/Makefile.am @@ -1,6 +1,6 @@ UTIL = ../util CLASSIFIER = ../classifier -TAGGER = ../tagger +SEQUENCE = ../sequence PARSER = ../parser TurboSemanticParserprgdir = ../.. 
@@ -30,7 +30,7 @@ $(PARSER)/DependencyInstance.cpp \ $(PARSER)/DependencyInstance.h \ $(PARSER)/DependencyReader.cpp \ $(PARSER)/DependencyReader.h \ -$(TAGGER)/TokenDictionary.cpp $(TAGGER)/TokenDictionary.h \ +$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/TokenDictionary.h \ $(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \ $(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \ $(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \ @@ -45,6 +45,6 @@ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \ $(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \ $(UTIL)/Utils.h -AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(TAGGER) -I$(PARSER) $(CPPFLAGS) +AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) -I$(PARSER) $(CPPFLAGS) LDADD = $(LFLAGS) diff --git a/src/semantic_parser/Makefile.in b/src/semantic_parser/Makefile.in index 743ba41..f35d126 100644 --- a/src/semantic_parser/Makefile.in +++ b/src/semantic_parser/Makefile.in @@ -174,7 +174,7 @@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ UTIL = ../util CLASSIFIER = ../classifier -TAGGER = ../tagger +SEQUENCE = ../sequence PARSER = ../parser TurboSemanticParserprgdir = ../.. 
TurboSemanticParser_SOURCES = SemanticDecoder.cpp SemanticFeatures.h \ @@ -202,7 +202,7 @@ $(PARSER)/DependencyInstance.cpp \ $(PARSER)/DependencyInstance.h \ $(PARSER)/DependencyReader.cpp \ $(PARSER)/DependencyReader.h \ -$(TAGGER)/TokenDictionary.cpp $(TAGGER)/TokenDictionary.h \ +$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/TokenDictionary.h \ $(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \ $(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \ $(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \ @@ -217,7 +217,7 @@ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \ $(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \ $(UTIL)/Utils.h -AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(TAGGER) -I$(PARSER) $(CPPFLAGS) +AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) -I$(PARSER) $(CPPFLAGS) LDADD = $(LFLAGS) all: all-am @@ -413,19 +413,19 @@ DependencyReader.obj: $(PARSER)/DependencyReader.cpp @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o DependencyReader.obj `if test -f '$(PARSER)/DependencyReader.cpp'; then $(CYGPATH_W) '$(PARSER)/DependencyReader.cpp'; else $(CYGPATH_W) '$(srcdir)/$(PARSER)/DependencyReader.cpp'; fi` -TokenDictionary.o: $(TAGGER)/TokenDictionary.cpp -@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.o `test -f '$(TAGGER)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(TAGGER)/TokenDictionary.cpp +TokenDictionary.o: $(SEQUENCE)/TokenDictionary.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o 
TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(TAGGER)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(TAGGER)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(TAGGER)/TokenDictionary.cpp +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp -TokenDictionary.obj: $(TAGGER)/TokenDictionary.cpp -@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(TAGGER)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(TAGGER)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(TAGGER)/TokenDictionary.cpp'; fi` +TokenDictionary.obj: $(SEQUENCE)/TokenDictionary.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi` @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/TokenDictionary.Tpo 
$(DEPDIR)/TokenDictionary.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(TAGGER)/TokenDictionary.cpp' object='TokenDictionary.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.obj' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(TAGGER)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(TAGGER)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(TAGGER)/TokenDictionary.cpp'; fi` +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi` Alphabet.o: $(CLASSIFIER)/Alphabet.cpp @am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT Alphabet.o -MD -MP -MF $(DEPDIR)/Alphabet.Tpo -c -o Alphabet.o `test -f '$(CLASSIFIER)/Alphabet.cpp' || echo '$(srcdir)/'`$(CLASSIFIER)/Alphabet.cpp diff --git a/src/tagger/SequenceDecoder.cpp b/src/sequence/SequenceDecoder.cpp similarity index 99% rename from src/tagger/SequenceDecoder.cpp rename to src/sequence/SequenceDecoder.cpp index bcb6fda..17563df 100644 --- a/src/tagger/SequenceDecoder.cpp +++ b/src/sequence/SequenceDecoder.cpp @@ -383,8 +383,8 @@ void SequenceDecoder::ConvertToFirstOrderModel( for (int l = 0; l < node_scores[i+2].size(); ++l, ++t) { // Tag l at position i+2. 
CHECK_LT(t, (*transformed_edge_scores)[i].size()); - CHECK_LT(j, (*transformed_edge_scores)[i][t].size()); - (*transformed_edge_scores)[i][t][j] = + CHECK_LT(j, (*transformed_edge_scores)[i][t].size()); + (*transformed_edge_scores)[i][t][j] = std::pair(s, triplet_scores[i][j][k][l]); } } @@ -489,7 +489,7 @@ double SequenceDecoder::RunViterbi(const vector > &node_scores, int num_current_labels = node_scores[i+1].size(); deltas[i + 1].resize(num_current_labels); backtrack[i + 1].resize(num_current_labels); - for (int k = 0; k < num_current_labels; ++k) { + for (int k = 0; k < num_current_labels; ++k) { double best_value = -1e-12; int best = -1; // Edges from the previous position. diff --git a/src/tagger/SequenceDecoder.h b/src/sequence/SequenceDecoder.h similarity index 100% rename from src/tagger/SequenceDecoder.h rename to src/sequence/SequenceDecoder.h diff --git a/src/sequence/SequenceDictionary.cpp b/src/sequence/SequenceDictionary.cpp new file mode 100644 index 0000000..ceb2bc2 --- /dev/null +++ b/src/sequence/SequenceDictionary.cpp @@ -0,0 +1,52 @@ +// Copyright (c) 2012-2013 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.1. +// +// TurboParser 2.1 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.1 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.1. If not, see . 
+ +#include "SequenceDictionary.h" +#include "SequencePipe.h" +#include + +void SequenceDictionary::CreateTagDictionary(SequenceReader *reader) { + LOG(INFO) << "Creating tag dictionary..."; + vector tag_freqs; + + // Go through the corpus and build the label dictionary, + // counting the frequencies. + reader->Open(pipe_->GetOptions()->GetTrainingFilePath()); + SequenceInstance *instance = + static_cast(reader->GetNext()); + while (instance != NULL) { + int instance_length = instance->size(); + for (int i = 0; i < instance_length; ++i) { + int id; + + // Add tag to alphabet. + id = tag_alphabet_.Insert(instance->GetTag(i)); + if (id >= tag_freqs.size()) { + CHECK_EQ(id, tag_freqs.size()); + tag_freqs.push_back(0); + } + ++tag_freqs[id]; + } + delete instance; + instance = static_cast(reader->GetNext()); + } + reader->Close(); + tag_alphabet_.StopGrowth(); + + LOG(INFO) << "Number of tags: " << tag_alphabet_.size(); +} diff --git a/src/sequence/SequenceDictionary.h b/src/sequence/SequenceDictionary.h new file mode 100644 index 0000000..2962b86 --- /dev/null +++ b/src/sequence/SequenceDictionary.h @@ -0,0 +1,92 @@ +// Copyright (c) 2012-2013 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.1. +// +// TurboParser 2.1 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.1 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.1. If not, see . 
+ +#ifndef SEQUENCEDICTIONARY_H_ +#define SEQUENCEDICTIONARY_H_ + +#include "Dictionary.h" +#include "TokenDictionary.h" +#include "SerializationUtils.h" + +class Pipe; + +class SequenceDictionary : public Dictionary { + public: + SequenceDictionary() {} + SequenceDictionary(Pipe* pipe) : pipe_(pipe) {} + virtual ~SequenceDictionary() { Clear(); } + + virtual void Clear() { + // Don't clear token_dictionary, since this class does not own it. + tag_alphabet_.clear(); + } + + virtual void Save(FILE *fs) { + if (0 > tag_alphabet_.Save(fs)) CHECK(false); + } + + void Load(FILE *fs) { + if (0 > tag_alphabet_.Load(fs)) CHECK(false); + tag_alphabet_.BuildNames(); + } + + void AllowGrowth() { token_dictionary_->AllowGrowth(); } + void StopGrowth() { token_dictionary_->StopGrowth(); } + + virtual void CreateTagDictionary(SequenceReader *reader); + + void BuildTagNames() { + tag_alphabet_.BuildNames(); + } + + const string &GetTagName(int tag) const { + return tag_alphabet_.GetName(tag); + } + + int GetBigramLabel(int left_tag, int tag) { + CHECK_GE(left_tag, -1); + CHECK_GE(tag, -1); + //return (left_tag * tag_alphabet_.size() + tag); + return ((1 + left_tag) * (1 + tag_alphabet_.size()) + (1 + tag)); + } + + int GetTrigramLabel(int left_left_tag, int left_tag, int tag) { + CHECK_GE(left_left_tag, -1); + CHECK_GE(left_tag, -1); + CHECK_GE(tag, -1); + //return (left_tag * left_tag * tag_alphabet_.size() + + // left_tag * tag_alphabet_.size() + tag); + return ((1 + left_left_tag) * (1 + tag_alphabet_.size()) * + (1 + tag_alphabet_.size()) + + (1 + left_tag) * (1 + tag_alphabet_.size()) + (1 + tag)); + } + + TokenDictionary *GetTokenDictionary() const { return token_dictionary_; } + void SetTokenDictionary(TokenDictionary *token_dictionary) { + token_dictionary_ = token_dictionary; + } + + const Alphabet &GetTagAlphabet() const { return tag_alphabet_; }; + + protected: + Pipe *pipe_; + TokenDictionary *token_dictionary_; + Alphabet tag_alphabet_; +}; + +#endif /* 
SEQUENCEDICTIONARY_H_ */ diff --git a/src/tagger/SequenceFeatures.h b/src/sequence/SequenceFeatures.h similarity index 79% rename from src/tagger/SequenceFeatures.h rename to src/sequence/SequenceFeatures.h index f6c3793..dab8850 100644 --- a/src/tagger/SequenceFeatures.h +++ b/src/sequence/SequenceFeatures.h @@ -21,7 +21,6 @@ #include "Features.h" #include "SequenceInstanceNumeric.h" -#include "FeatureEncoder.h" class SequenceOptions; @@ -93,18 +92,28 @@ class SequenceFeatures: public Features { }; public: - void AddUnigramFeatures(SequenceInstanceNumeric *sentence, - int position); - - void AddBigramFeatures(SequenceInstanceNumeric *sentence, - int position); + virtual void AddUnigramFeatures(SequenceInstanceNumeric *sentence, + int position) { + // Add an empty feature vector. + CHECK(!input_features_unigrams_[position]); + BinaryFeatures *features = new BinaryFeatures; + input_features_unigrams_[position] = features; + } - void AddTrigramFeatures(SequenceInstanceNumeric *sentence, - int position); + virtual void AddBigramFeatures(SequenceInstanceNumeric *sentence, + int position) { + // Add an empty feature vector. + CHECK(!input_features_bigrams_[position]); + BinaryFeatures *features = new BinaryFeatures; + input_features_bigrams_[position] = features; + } - protected: - void AddFeature(uint64_t fkey, BinaryFeatures* features) { - features->push_back(fkey); + virtual void AddTrigramFeatures(SequenceInstanceNumeric *sentence, + int position) { + // Add an empty feature vector. + CHECK(!input_features_trigrams_[position]); + BinaryFeatures *features = new BinaryFeatures; + input_features_trigrams_[position] = features; } protected: @@ -112,7 +121,6 @@ class SequenceFeatures: public Features { vector input_features_unigrams_; vector input_features_bigrams_; vector input_features_trigrams_; - FeatureEncoder encoder_; // Encoder that converts features into a codeword. 
}; #endif /* SEQUENCEFEATURES_H_ */ diff --git a/src/tagger/SequenceInstance.cpp b/src/sequence/SequenceInstance.cpp similarity index 100% rename from src/tagger/SequenceInstance.cpp rename to src/sequence/SequenceInstance.cpp diff --git a/src/tagger/SequenceInstance.h b/src/sequence/SequenceInstance.h similarity index 100% rename from src/tagger/SequenceInstance.h rename to src/sequence/SequenceInstance.h diff --git a/src/tagger/SequenceInstanceNumeric.cpp b/src/sequence/SequenceInstanceNumeric.cpp similarity index 100% rename from src/tagger/SequenceInstanceNumeric.cpp rename to src/sequence/SequenceInstanceNumeric.cpp diff --git a/src/tagger/SequenceInstanceNumeric.h b/src/sequence/SequenceInstanceNumeric.h similarity index 100% rename from src/tagger/SequenceInstanceNumeric.h rename to src/sequence/SequenceInstanceNumeric.h diff --git a/src/sequence/SequenceOptions.cpp b/src/sequence/SequenceOptions.cpp new file mode 100644 index 0000000..b788b96 --- /dev/null +++ b/src/sequence/SequenceOptions.cpp @@ -0,0 +1,81 @@ +// Copyright (c) 2012-2013 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.1. +// +// TurboParser 2.1 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.1 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.1. If not, see . + +#include "SequenceOptions.h" +#include "SerializationUtils.h" +#include + +using namespace std; + +DEFINE_int32(sequence_model_type, 2, + "Model type. 
1 is a bigram model, 2 is a trigram model."); +//DEFINE_bool(tagger_large_feature_set, false, +// "True for using a large feature set. Taggers are usually more " +// "accurate but slower and have a larger memory footprint."); +//DEFINE_bool(sequence_prune_tags, true, +// "True for pruning the set of possible tags by using a dictionary."); + +// Save current option flags to the model file. +void SequenceOptions::Save(FILE* fs) { + Options::Save(fs); + + bool success; + success = WriteInteger(fs, model_type_); + CHECK(success); + //success = WriteBool(fs, large_feature_set_); + //CHECK(success); + //success = WriteBool(fs, prune_tags_); + //CHECK(success); + + // TODO: Maybe we should load/save also the list of tags for unknown + // words? +} + +// Load current option flags from the model file. +// Note: this will override the user-specified flags. +void SequenceOptions::Load(FILE* fs) { + Options::Load(fs); + + bool success; + success = ReadInteger(fs, &FLAGS_sequence_model_type); + CHECK(success); + LOG(INFO) << "Setting --sequence_model_type=" << FLAGS_sequence_model_type; + //success = ReadBool(fs, &FLAGS_tagger_large_feature_set); + //CHECK(success); + //LOG(INFO) << "Setting --tagger_large_feature_set=" + // << FLAGS_tagger_large_feature_set; + //success = ReadBool(fs, &FLAGS_sequence_prune_tags); + //CHECK(success); + //LOG(INFO) << "Setting --sequence_prune_tags=" << FLAGS_sequence_prune_tags; + + // TODO: Maybe we should load/save also the list of tags for unknown + // words? 
+ + Initialize(); +} + +void SequenceOptions::Initialize() { + Options::Initialize(); + + //file_format_ = FLAGS_tagger_file_format; + model_type_ = FLAGS_sequence_model_type; + //large_feature_set_ = FLAGS_tagger_large_feature_set; + //prune_tags_ = FLAGS_sequence_prune_tags; + //file_unknown_word_tags_ = FLAGS_file_unknown_word_tags; +} + diff --git a/src/tagger/SequenceOptions.h b/src/sequence/SequenceOptions.h similarity index 76% rename from src/tagger/SequenceOptions.h rename to src/sequence/SequenceOptions.h index f08697d..a822b30 100644 --- a/src/tagger/SequenceOptions.h +++ b/src/sequence/SequenceOptions.h @@ -27,26 +27,17 @@ class SequenceOptions : public Options { virtual ~SequenceOptions() {}; // Serialization functions. - void Load(FILE* fs); - void Save(FILE* fs); + virtual void Load(FILE* fs); + virtual void Save(FILE* fs); // Initialization: set options based on the flags. - void Initialize(); + virtual void Initialize(); // Get option flags. - bool large_feature_set() { return large_feature_set_; } - bool prune_tags() { return prune_tags_; } int markov_order() { return model_type_; } - const string &GetUnknownWordTagsFilePath() { - return file_unknown_word_tags_; - } protected: - string file_format_; int model_type_; - bool large_feature_set_; - bool prune_tags_; - string file_unknown_word_tags_; }; #endif // SEQUENCE_OPTIONS_H_ diff --git a/src/tagger/SequencePart.cpp b/src/sequence/SequencePart.cpp similarity index 100% rename from src/tagger/SequencePart.cpp rename to src/sequence/SequencePart.cpp diff --git a/src/tagger/SequencePart.h b/src/sequence/SequencePart.h similarity index 100% rename from src/tagger/SequencePart.h rename to src/sequence/SequencePart.h diff --git a/src/tagger/SequencePipe.cpp b/src/sequence/SequencePipe.cpp similarity index 98% rename from src/tagger/SequencePipe.cpp rename to src/sequence/SequencePipe.cpp index b34eea6..4447072 100644 --- a/src/tagger/SequencePipe.cpp +++ b/src/sequence/SequencePipe.cpp @@ -35,11 
+35,11 @@ void SequencePipe::SaveModel(FILE* fs) { void SequencePipe::LoadModel(FILE* fs) { delete token_dictionary_; - CreateTokenDictionary(); + CreateTokenDictionary(); token_dictionary_->Load(fs); Pipe::LoadModel(fs); static_cast(dictionary_)-> - SetTokenDictionary(token_dictionary_); + SetTokenDictionary(token_dictionary_); } void SequencePipe::PreprocessData() { @@ -283,7 +283,6 @@ void SequencePipe::MakeUnigramParts(Instance *instance, SequenceOptions *sequence_options = GetSequenceOptions(); int sentence_length = sentence->size(); bool make_gold = (gold_outputs != NULL); - bool prune_tags = sequence_options->prune_tags(); vector all_tags; vector allowed_tags; @@ -295,14 +294,8 @@ void SequencePipe::MakeUnigramParts(Instance *instance, int num_parts_initial = sequence_parts->size(); for (int i = 0; i < sentence_length; ++i) { - if (prune_tags) { - int word_id = sentence->GetFormId(i); - allowed_tags = sequence_dictionary->GetWordTags(word_id); - // For unknown words, allow all the tags. 
- if (allowed_tags.empty()) { - allowed_tags = all_tags; - } - } else { + GetAllowedTags(instance, i, &allowed_tags); + if (allowed_tags.empty()) { allowed_tags = all_tags; } diff --git a/src/tagger/SequencePipe.h b/src/sequence/SequencePipe.h similarity index 88% rename from src/tagger/SequencePipe.h rename to src/sequence/SequencePipe.h index 22630e8..ea81b06 100644 --- a/src/tagger/SequencePipe.h +++ b/src/sequence/SequencePipe.h @@ -46,23 +46,23 @@ class SequencePipe : public Pipe { }; protected: - void CreateDictionary() { + virtual void CreateDictionary() { dictionary_ = new SequenceDictionary(this); GetSequenceDictionary()->SetTokenDictionary(token_dictionary_); } - void CreateReader() { reader_ = new SequenceReader; } - void CreateWriter() { writer_ = new SequenceWriter; } + virtual void CreateReader() { reader_ = new SequenceReader; } + virtual void CreateWriter() { writer_ = new SequenceWriter; } void CreateDecoder() { decoder_ = new SequenceDecoder(this); }; Parts *CreateParts() { return new SequenceParts; }; - Features *CreateFeatures() { return new SequenceFeatures(this); }; + virtual Features *CreateFeatures() { return new SequenceFeatures(this); }; void CreateTokenDictionary() { token_dictionary_ = new TokenDictionary(this); }; - void PreprocessData(); + virtual void PreprocessData(); - Instance *GetFormattedInstance(Instance *instance) { + virtual Instance *GetFormattedInstance(Instance *instance) { SequenceInstanceNumeric *instance_numeric = new SequenceInstanceNumeric; instance_numeric->Initialize(*GetSequenceDictionary(), @@ -71,8 +71,16 @@ class SequencePipe : public Pipe { } protected: - void SaveModel(FILE* fs); - void LoadModel(FILE* fs); + virtual void SaveModel(FILE* fs); + virtual void LoadModel(FILE* fs); + + // Return the allowed tags for the i-th word. An empty vector means that all + // tags are allowed. + virtual void GetAllowedTags(Instance *instance, int i, + vector *allowed_tags) { + // By default, allow all tags. 
+ allowed_tags->clear(); + } void MakeParts(Instance *instance, Parts *parts, vector *gold_outputs); diff --git a/src/tagger/SequenceReader.cpp b/src/sequence/SequenceReader.cpp similarity index 100% rename from src/tagger/SequenceReader.cpp rename to src/sequence/SequenceReader.cpp diff --git a/src/tagger/SequenceReader.h b/src/sequence/SequenceReader.h similarity index 100% rename from src/tagger/SequenceReader.h rename to src/sequence/SequenceReader.h diff --git a/src/tagger/SequenceWriter.cpp b/src/sequence/SequenceWriter.cpp similarity index 100% rename from src/tagger/SequenceWriter.cpp rename to src/sequence/SequenceWriter.cpp diff --git a/src/tagger/SequenceWriter.h b/src/sequence/SequenceWriter.h similarity index 100% rename from src/tagger/SequenceWriter.h rename to src/sequence/SequenceWriter.h diff --git a/src/tagger/TokenDictionary.cpp b/src/sequence/TokenDictionary.cpp similarity index 100% rename from src/tagger/TokenDictionary.cpp rename to src/sequence/TokenDictionary.cpp diff --git a/src/tagger/TokenDictionary.h b/src/sequence/TokenDictionary.h similarity index 100% rename from src/tagger/TokenDictionary.h rename to src/sequence/TokenDictionary.h diff --git a/src/tagger/Makefile.am b/src/tagger/Makefile.am index a50dbe5..8a9619b 100644 --- a/src/tagger/Makefile.am +++ b/src/tagger/Makefile.am @@ -1,16 +1,25 @@ UTIL = ../util CLASSIFIER = ../classifier +SEQUENCE = ../sequence PARSER = ../parser TurboTaggerprgdir = ../.. 
TurboTaggerprg_PROGRAMS = TurboTagger -TurboTagger_SOURCES = SequenceFeatures.cpp SequenceInstanceNumeric.cpp \ -SequencePart.h SequenceWriter.cpp SequenceDecoder.cpp SequenceFeatures.h \ -SequenceInstanceNumeric.h SequencePipe.cpp SequenceWriter.h SequenceDecoder.h \ -SequenceFeatureTemplates.h SequenceOptions.cpp SequencePipe.h \ -TokenDictionary.cpp SequenceDictionary.cpp SequenceInstance.cpp \ -SequenceOptions.h SequenceReader.cpp TokenDictionary.h SequenceDictionary.h \ -SequenceInstance.h SequencePart.cpp SequenceReader.h TurboTagger.cpp \ +TurboTagger_SOURCES = TaggerFeatures.cpp TaggerFeatures.h \ +TaggerFeatureTemplates.h TaggerOptions.cpp TaggerOptions.h \ +TaggerDictionary.cpp TaggerDictionary.h \ +TaggerPipe.cpp TaggerPipe.h \ +$(SEQUENCE)/SequenceInstanceNumeric.cpp \ +$(SEQUENCE)/SequencePart.h $(SEQUENCE)/SequenceWriter.cpp \ +$(SEQUENCE)/SequenceDecoder.cpp $(SEQUENCE)/SequenceFeatures.h \ +$(SEQUENCE)/SequenceInstanceNumeric.h $(SEQUENCE)/SequencePipe.cpp \ +$(SEQUENCE)/SequenceWriter.h $(SEQUENCE)/SequenceDecoder.h \ +$(SEQUENCE)/SequenceOptions.cpp $(SEQUENCE)/SequencePipe.h \ +$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/SequenceDictionary.cpp \ +$(SEQUENCE)/SequenceInstance.cpp $(SEQUENCE)/SequenceOptions.h \ +$(SEQUENCE)/SequenceReader.cpp $(SEQUENCE)/SequenceDictionary.h \ +$(SEQUENCE)/SequenceInstance.h $(SEQUENCE)/SequencePart.cpp \ +$(SEQUENCE)/SequenceReader.h $(SEQUENCE)/TokenDictionary.h \ $(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \ $(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \ $(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \ @@ -23,8 +32,9 @@ $(CLASSIFIER)/SparseLabeledParameterVector.h $(CLASSIFIER)/Writer.h \ $(UTIL)/AlgUtils.cpp $(UTIL)/logval.h $(UTIL)/SerializationUtils.h \ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \ $(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \ -$(UTIL)/Utils.h +$(UTIL)/Utils.h \ +TurboTagger.cpp 
-AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(PARSER) $(CPPFLAGS) +AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) -I$(PARSER) $(CPPFLAGS) LDADD = $(LFLAGS) diff --git a/src/tagger/Makefile.in b/src/tagger/Makefile.in index acd1cdb..d9fb032 100644 --- a/src/tagger/Makefile.in +++ b/src/tagger/Makefile.in @@ -45,17 +45,18 @@ CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = am__installdirs = "$(DESTDIR)$(TurboTaggerprgdir)" PROGRAMS = $(TurboTaggerprg_PROGRAMS) -am_TurboTagger_OBJECTS = SequenceFeatures.$(OBJEXT) \ - SequenceInstanceNumeric.$(OBJEXT) SequenceWriter.$(OBJEXT) \ - SequenceDecoder.$(OBJEXT) SequencePipe.$(OBJEXT) \ - SequenceOptions.$(OBJEXT) TokenDictionary.$(OBJEXT) \ - SequenceDictionary.$(OBJEXT) SequenceInstance.$(OBJEXT) \ - SequenceReader.$(OBJEXT) SequencePart.$(OBJEXT) \ - TurboTagger.$(OBJEXT) Alphabet.$(OBJEXT) Dictionary.$(OBJEXT) \ +am_TurboTagger_OBJECTS = TaggerFeatures.$(OBJEXT) \ + TaggerOptions.$(OBJEXT) TaggerDictionary.$(OBJEXT) \ + TaggerPipe.$(OBJEXT) SequenceInstanceNumeric.$(OBJEXT) \ + SequenceWriter.$(OBJEXT) SequenceDecoder.$(OBJEXT) \ + SequencePipe.$(OBJEXT) SequenceOptions.$(OBJEXT) \ + TokenDictionary.$(OBJEXT) SequenceDictionary.$(OBJEXT) \ + SequenceInstance.$(OBJEXT) SequenceReader.$(OBJEXT) \ + SequencePart.$(OBJEXT) Alphabet.$(OBJEXT) Dictionary.$(OBJEXT) \ Reader.$(OBJEXT) Parameters.$(OBJEXT) Pipe.$(OBJEXT) \ Writer.$(OBJEXT) Options.$(OBJEXT) AlgUtils.$(OBJEXT) \ SerializationUtils.$(OBJEXT) StringUtils.$(OBJEXT) \ - TimeUtils.$(OBJEXT) + TimeUtils.$(OBJEXT) TurboTagger.$(OBJEXT) TurboTagger_OBJECTS = $(am_TurboTagger_OBJECTS) TurboTagger_LDADD = $(LDADD) am__DEPENDENCIES_1 = @@ -171,15 +172,24 @@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ UTIL = ../util CLASSIFIER = ../classifier +SEQUENCE = ../sequence PARSER = ../parser TurboTaggerprgdir = ../.. 
-TurboTagger_SOURCES = SequenceFeatures.cpp SequenceInstanceNumeric.cpp \ -SequencePart.h SequenceWriter.cpp SequenceDecoder.cpp SequenceFeatures.h \ -SequenceInstanceNumeric.h SequencePipe.cpp SequenceWriter.h SequenceDecoder.h \ -SequenceFeatureTemplates.h SequenceOptions.cpp SequencePipe.h \ -TokenDictionary.cpp SequenceDictionary.cpp SequenceInstance.cpp \ -SequenceOptions.h SequenceReader.cpp TokenDictionary.h SequenceDictionary.h \ -SequenceInstance.h SequencePart.cpp SequenceReader.h TurboTagger.cpp \ +TurboTagger_SOURCES = TaggerFeatures.cpp TaggerFeatures.h \ +TaggerFeatureTemplates.h TaggerOptions.cpp TaggerOptions.h \ +TaggerDictionary.cpp TaggerDictionary.h \ +TaggerPipe.cpp TaggerPipe.h \ +$(SEQUENCE)/SequenceInstanceNumeric.cpp \ +$(SEQUENCE)/SequencePart.h $(SEQUENCE)/SequenceWriter.cpp \ +$(SEQUENCE)/SequenceDecoder.cpp $(SEQUENCE)/SequenceFeatures.h \ +$(SEQUENCE)/SequenceInstanceNumeric.h $(SEQUENCE)/SequencePipe.cpp \ +$(SEQUENCE)/SequenceWriter.h $(SEQUENCE)/SequenceDecoder.h \ +$(SEQUENCE)/SequenceOptions.cpp $(SEQUENCE)/SequencePipe.h \ +$(SEQUENCE)/TokenDictionary.cpp $(SEQUENCE)/SequenceDictionary.cpp \ +$(SEQUENCE)/SequenceInstance.cpp $(SEQUENCE)/SequenceOptions.h \ +$(SEQUENCE)/SequenceReader.cpp $(SEQUENCE)/SequenceDictionary.h \ +$(SEQUENCE)/SequenceInstance.h $(SEQUENCE)/SequencePart.cpp \ +$(SEQUENCE)/SequenceReader.h $(SEQUENCE)/TokenDictionary.h \ $(CLASSIFIER)/Alphabet.cpp $(CLASSIFIER)/Dictionary.cpp \ $(CLASSIFIER)/Features.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Part.h \ $(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/SparseParameterVector.h \ @@ -192,9 +202,10 @@ $(CLASSIFIER)/SparseLabeledParameterVector.h $(CLASSIFIER)/Writer.h \ $(UTIL)/AlgUtils.cpp $(UTIL)/logval.h $(UTIL)/SerializationUtils.h \ $(UTIL)/StringUtils.h $(UTIL)/TimeUtils.h $(UTIL)/AlgUtils.h \ $(UTIL)/SerializationUtils.cpp $(UTIL)/StringUtils.cpp $(UTIL)/TimeUtils.cpp \ -$(UTIL)/Utils.h +$(UTIL)/Utils.h \ +TurboTagger.cpp -AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) 
-I$(PARSER) $(CPPFLAGS) +AM_CPPFLAGS = -I$(UTIL) -I$(CLASSIFIER) -I$(SEQUENCE) -I$(PARSER) $(CPPFLAGS) LDADD = $(LFLAGS) all: all-am @@ -286,7 +297,6 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Reader.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/SequenceDecoder.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/SequenceDictionary.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/SequenceFeatures.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/SequenceInstance.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/SequenceInstanceNumeric.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/SequenceOptions.Po@am__quote@ @@ -296,6 +306,10 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/SequenceWriter.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/SerializationUtils.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/StringUtils.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TaggerDictionary.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TaggerFeatures.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TaggerOptions.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TaggerPipe.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TimeUtils.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TokenDictionary.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TurboTagger.Po@am__quote@ @@ -315,6 +329,146 @@ distclean-compile: @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` +SequenceInstanceNumeric.o: $(SEQUENCE)/SequenceInstanceNumeric.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceInstanceNumeric.o -MD -MP -MF 
$(DEPDIR)/SequenceInstanceNumeric.Tpo -c -o SequenceInstanceNumeric.o `test -f '$(SEQUENCE)/SequenceInstanceNumeric.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceInstanceNumeric.cpp +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceInstanceNumeric.Tpo $(DEPDIR)/SequenceInstanceNumeric.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceInstanceNumeric.cpp' object='SequenceInstanceNumeric.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceInstanceNumeric.o `test -f '$(SEQUENCE)/SequenceInstanceNumeric.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceInstanceNumeric.cpp + +SequenceInstanceNumeric.obj: $(SEQUENCE)/SequenceInstanceNumeric.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceInstanceNumeric.obj -MD -MP -MF $(DEPDIR)/SequenceInstanceNumeric.Tpo -c -o SequenceInstanceNumeric.obj `if test -f '$(SEQUENCE)/SequenceInstanceNumeric.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceInstanceNumeric.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequenceInstanceNumeric.cpp'; fi` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceInstanceNumeric.Tpo $(DEPDIR)/SequenceInstanceNumeric.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceInstanceNumeric.cpp' object='SequenceInstanceNumeric.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceInstanceNumeric.obj `if test -f '$(SEQUENCE)/SequenceInstanceNumeric.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceInstanceNumeric.cpp'; else $(CYGPATH_W) 
'$(srcdir)/$(SEQUENCE)/SequenceInstanceNumeric.cpp'; fi` + +SequenceWriter.o: $(SEQUENCE)/SequenceWriter.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceWriter.o -MD -MP -MF $(DEPDIR)/SequenceWriter.Tpo -c -o SequenceWriter.o `test -f '$(SEQUENCE)/SequenceWriter.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceWriter.cpp +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceWriter.Tpo $(DEPDIR)/SequenceWriter.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceWriter.cpp' object='SequenceWriter.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceWriter.o `test -f '$(SEQUENCE)/SequenceWriter.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceWriter.cpp + +SequenceWriter.obj: $(SEQUENCE)/SequenceWriter.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceWriter.obj -MD -MP -MF $(DEPDIR)/SequenceWriter.Tpo -c -o SequenceWriter.obj `if test -f '$(SEQUENCE)/SequenceWriter.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceWriter.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequenceWriter.cpp'; fi` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceWriter.Tpo $(DEPDIR)/SequenceWriter.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceWriter.cpp' object='SequenceWriter.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceWriter.obj `if test -f '$(SEQUENCE)/SequenceWriter.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceWriter.cpp'; else $(CYGPATH_W) 
'$(srcdir)/$(SEQUENCE)/SequenceWriter.cpp'; fi` + +SequenceDecoder.o: $(SEQUENCE)/SequenceDecoder.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceDecoder.o -MD -MP -MF $(DEPDIR)/SequenceDecoder.Tpo -c -o SequenceDecoder.o `test -f '$(SEQUENCE)/SequenceDecoder.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceDecoder.cpp +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceDecoder.Tpo $(DEPDIR)/SequenceDecoder.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceDecoder.cpp' object='SequenceDecoder.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceDecoder.o `test -f '$(SEQUENCE)/SequenceDecoder.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceDecoder.cpp + +SequenceDecoder.obj: $(SEQUENCE)/SequenceDecoder.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceDecoder.obj -MD -MP -MF $(DEPDIR)/SequenceDecoder.Tpo -c -o SequenceDecoder.obj `if test -f '$(SEQUENCE)/SequenceDecoder.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceDecoder.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequenceDecoder.cpp'; fi` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceDecoder.Tpo $(DEPDIR)/SequenceDecoder.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceDecoder.cpp' object='SequenceDecoder.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceDecoder.obj `if test -f '$(SEQUENCE)/SequenceDecoder.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceDecoder.cpp'; else 
$(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequenceDecoder.cpp'; fi` + +SequencePipe.o: $(SEQUENCE)/SequencePipe.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequencePipe.o -MD -MP -MF $(DEPDIR)/SequencePipe.Tpo -c -o SequencePipe.o `test -f '$(SEQUENCE)/SequencePipe.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequencePipe.cpp +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequencePipe.Tpo $(DEPDIR)/SequencePipe.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequencePipe.cpp' object='SequencePipe.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequencePipe.o `test -f '$(SEQUENCE)/SequencePipe.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequencePipe.cpp + +SequencePipe.obj: $(SEQUENCE)/SequencePipe.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequencePipe.obj -MD -MP -MF $(DEPDIR)/SequencePipe.Tpo -c -o SequencePipe.obj `if test -f '$(SEQUENCE)/SequencePipe.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequencePipe.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequencePipe.cpp'; fi` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequencePipe.Tpo $(DEPDIR)/SequencePipe.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequencePipe.cpp' object='SequencePipe.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequencePipe.obj `if test -f '$(SEQUENCE)/SequencePipe.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequencePipe.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequencePipe.cpp'; fi` + +SequenceOptions.o: 
$(SEQUENCE)/SequenceOptions.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceOptions.o -MD -MP -MF $(DEPDIR)/SequenceOptions.Tpo -c -o SequenceOptions.o `test -f '$(SEQUENCE)/SequenceOptions.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceOptions.cpp +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceOptions.Tpo $(DEPDIR)/SequenceOptions.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceOptions.cpp' object='SequenceOptions.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceOptions.o `test -f '$(SEQUENCE)/SequenceOptions.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceOptions.cpp + +SequenceOptions.obj: $(SEQUENCE)/SequenceOptions.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceOptions.obj -MD -MP -MF $(DEPDIR)/SequenceOptions.Tpo -c -o SequenceOptions.obj `if test -f '$(SEQUENCE)/SequenceOptions.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceOptions.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequenceOptions.cpp'; fi` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceOptions.Tpo $(DEPDIR)/SequenceOptions.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceOptions.cpp' object='SequenceOptions.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceOptions.obj `if test -f '$(SEQUENCE)/SequenceOptions.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceOptions.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequenceOptions.cpp'; fi` + 
+TokenDictionary.o: $(SEQUENCE)/TokenDictionary.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.o -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.o `test -f '$(SEQUENCE)/TokenDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/TokenDictionary.cpp + +TokenDictionary.obj: $(SEQUENCE)/TokenDictionary.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT TokenDictionary.obj -MD -MP -MF $(DEPDIR)/TokenDictionary.Tpo -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; fi` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/TokenDictionary.Tpo $(DEPDIR)/TokenDictionary.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/TokenDictionary.cpp' object='TokenDictionary.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o TokenDictionary.obj `if test -f '$(SEQUENCE)/TokenDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/TokenDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/TokenDictionary.cpp'; 
fi` + +SequenceDictionary.o: $(SEQUENCE)/SequenceDictionary.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceDictionary.o -MD -MP -MF $(DEPDIR)/SequenceDictionary.Tpo -c -o SequenceDictionary.o `test -f '$(SEQUENCE)/SequenceDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceDictionary.cpp +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceDictionary.Tpo $(DEPDIR)/SequenceDictionary.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceDictionary.cpp' object='SequenceDictionary.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceDictionary.o `test -f '$(SEQUENCE)/SequenceDictionary.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceDictionary.cpp + +SequenceDictionary.obj: $(SEQUENCE)/SequenceDictionary.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceDictionary.obj -MD -MP -MF $(DEPDIR)/SequenceDictionary.Tpo -c -o SequenceDictionary.obj `if test -f '$(SEQUENCE)/SequenceDictionary.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequenceDictionary.cpp'; fi` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceDictionary.Tpo $(DEPDIR)/SequenceDictionary.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceDictionary.cpp' object='SequenceDictionary.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceDictionary.obj `if test -f '$(SEQUENCE)/SequenceDictionary.cpp'; then $(CYGPATH_W) 
'$(SEQUENCE)/SequenceDictionary.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequenceDictionary.cpp'; fi` + +SequenceInstance.o: $(SEQUENCE)/SequenceInstance.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceInstance.o -MD -MP -MF $(DEPDIR)/SequenceInstance.Tpo -c -o SequenceInstance.o `test -f '$(SEQUENCE)/SequenceInstance.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceInstance.cpp +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceInstance.Tpo $(DEPDIR)/SequenceInstance.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceInstance.cpp' object='SequenceInstance.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceInstance.o `test -f '$(SEQUENCE)/SequenceInstance.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceInstance.cpp + +SequenceInstance.obj: $(SEQUENCE)/SequenceInstance.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceInstance.obj -MD -MP -MF $(DEPDIR)/SequenceInstance.Tpo -c -o SequenceInstance.obj `if test -f '$(SEQUENCE)/SequenceInstance.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceInstance.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequenceInstance.cpp'; fi` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceInstance.Tpo $(DEPDIR)/SequenceInstance.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceInstance.cpp' object='SequenceInstance.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceInstance.obj `if test -f 
'$(SEQUENCE)/SequenceInstance.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceInstance.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequenceInstance.cpp'; fi` + +SequenceReader.o: $(SEQUENCE)/SequenceReader.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceReader.o -MD -MP -MF $(DEPDIR)/SequenceReader.Tpo -c -o SequenceReader.o `test -f '$(SEQUENCE)/SequenceReader.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceReader.cpp +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceReader.Tpo $(DEPDIR)/SequenceReader.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceReader.cpp' object='SequenceReader.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceReader.o `test -f '$(SEQUENCE)/SequenceReader.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequenceReader.cpp + +SequenceReader.obj: $(SEQUENCE)/SequenceReader.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequenceReader.obj -MD -MP -MF $(DEPDIR)/SequenceReader.Tpo -c -o SequenceReader.obj `if test -f '$(SEQUENCE)/SequenceReader.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceReader.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequenceReader.cpp'; fi` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequenceReader.Tpo $(DEPDIR)/SequenceReader.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequenceReader.cpp' object='SequenceReader.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequenceReader.obj `if test -f 
'$(SEQUENCE)/SequenceReader.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequenceReader.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequenceReader.cpp'; fi` + +SequencePart.o: $(SEQUENCE)/SequencePart.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequencePart.o -MD -MP -MF $(DEPDIR)/SequencePart.Tpo -c -o SequencePart.o `test -f '$(SEQUENCE)/SequencePart.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequencePart.cpp +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequencePart.Tpo $(DEPDIR)/SequencePart.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequencePart.cpp' object='SequencePart.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequencePart.o `test -f '$(SEQUENCE)/SequencePart.cpp' || echo '$(srcdir)/'`$(SEQUENCE)/SequencePart.cpp + +SequencePart.obj: $(SEQUENCE)/SequencePart.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT SequencePart.obj -MD -MP -MF $(DEPDIR)/SequencePart.Tpo -c -o SequencePart.obj `if test -f '$(SEQUENCE)/SequencePart.cpp'; then $(CYGPATH_W) '$(SEQUENCE)/SequencePart.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequencePart.cpp'; fi` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/SequencePart.Tpo $(DEPDIR)/SequencePart.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$(SEQUENCE)/SequencePart.cpp' object='SequencePart.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o SequencePart.obj `if test -f '$(SEQUENCE)/SequencePart.cpp'; then $(CYGPATH_W) 
'$(SEQUENCE)/SequencePart.cpp'; else $(CYGPATH_W) '$(srcdir)/$(SEQUENCE)/SequencePart.cpp'; fi` + Alphabet.o: $(CLASSIFIER)/Alphabet.cpp @am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT Alphabet.o -MD -MP -MF $(DEPDIR)/Alphabet.Tpo -c -o Alphabet.o `test -f '$(CLASSIFIER)/Alphabet.cpp' || echo '$(srcdir)/'`$(CLASSIFIER)/Alphabet.cpp @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/Alphabet.Tpo $(DEPDIR)/Alphabet.Po diff --git a/src/tagger/SequenceDictionary.cpp b/src/tagger/TaggerDictionary.cpp similarity index 71% rename from src/tagger/SequenceDictionary.cpp rename to src/tagger/TaggerDictionary.cpp index 77c4e95..5346e2d 100644 --- a/src/tagger/SequenceDictionary.cpp +++ b/src/tagger/TaggerDictionary.cpp @@ -16,47 +16,24 @@ // You should have received a copy of the GNU Lesser General Public License // along with TurboParser 2.1. If not, see . -#include "SequenceDictionary.h" -#include "SequencePipe.h" +#include "TaggerDictionary.h" +#include "TaggerOptions.h" +#include "TaggerPipe.h" #include -void SequenceDictionary::CreateTagDictionary(SequenceReader *reader) { - LOG(INFO) << "Creating tag dictionary..."; - bool form_case_sensitive = FLAGS_form_case_sensitive; - - vector tag_freqs; - - // Go through the corpus and build the label dictionary, - // counting the frequencies. - reader->Open(pipe_->GetOptions()->GetTrainingFilePath()); - SequenceInstance *instance = - static_cast(reader->GetNext()); - while (instance != NULL) { - int instance_length = instance->size(); - for (int i = 0; i < instance_length; ++i) { - int id; +void TaggerDictionary::CreateTagDictionary(SequenceReader *reader) { + SequenceDictionary::CreateTagDictionary(reader); - // Add tag to alphabet. 
- id = tag_alphabet_.Insert(instance->GetTag(i)); - if (id >= tag_freqs.size()) { - CHECK_EQ(id, tag_freqs.size()); - tag_freqs.push_back(0); - } - ++tag_freqs[id]; - } - delete instance; - instance = static_cast(reader->GetNext()); - } - reader->Close(); - tag_alphabet_.StopGrowth(); + LOG(INFO) << "Creating word-tag dictionary..."; + bool form_case_sensitive = FLAGS_form_case_sensitive; - // Go through the corpus and build the existing labels for each head-modifier - // POS pair. + // Go through the corpus and build the existing tags for each word. word_tags_.clear(); word_tags_.resize(token_dictionary_->GetNumForms()); reader->Open(pipe_->GetOptions()->GetTrainingFilePath()); - instance = static_cast(reader->GetNext()); + SequenceInstance *instance = + static_cast(reader->GetNext()); while (instance != NULL) { int instance_length = instance->size(); for (int i = 0; i < instance_length; ++i) { @@ -88,11 +65,9 @@ void SequenceDictionary::CreateTagDictionary(SequenceReader *reader) { } reader->Close(); - LOG(INFO) << "Number of tags: " << tag_alphabet_.size(); - // If there is a list of possible tags for the unknown words, load it. 
- SequenceOptions *options = - static_cast(pipe_->GetOptions()); + TaggerOptions *options = + static_cast(pipe_->GetOptions()); if (options->GetUnknownWordTagsFilePath().size() == 0) { for (int i = 0; i < tag_alphabet_.size(); ++i) { unknown_word_tags_.push_back(i); @@ -101,7 +76,7 @@ void SequenceDictionary::CreateTagDictionary(SequenceReader *reader) { LOG(INFO) << "Loading file with unknown word tags..."; std::ifstream is; is.open(options->GetUnknownWordTagsFilePath().c_str(), ifstream::in); - CHECK(is.good()) << "Could not open " + CHECK(is.good()) << "Could not open " << options->GetUnknownWordTagsFilePath() << "."; vector > sentence_fields; string line; diff --git a/src/tagger/SequenceDictionary.h b/src/tagger/TaggerDictionary.h similarity index 60% rename from src/tagger/SequenceDictionary.h rename to src/tagger/TaggerDictionary.h index c8f8ace..0b9e2c5 100644 --- a/src/tagger/SequenceDictionary.h +++ b/src/tagger/TaggerDictionary.h @@ -16,29 +16,24 @@ // You should have received a copy of the GNU Lesser General Public License // along with TurboParser 2.1. If not, see . -#ifndef SEQUENCEDICTIONARY_H_ -#define SEQUENCEDICTIONARY_H_ +#ifndef TAGGERDICTIONARY_H_ +#define TAGGERDICTIONARY_H_ -#include "Dictionary.h" -#include "TokenDictionary.h" -#include "SerializationUtils.h" +#include "SequenceDictionary.h" -class Pipe; - -class SequenceDictionary : public Dictionary { +class TaggerDictionary : public SequenceDictionary { public: - SequenceDictionary() {} - SequenceDictionary(Pipe* pipe) : pipe_(pipe) {} - virtual ~SequenceDictionary() { Clear(); } + TaggerDictionary() {} + TaggerDictionary(Pipe* pipe) : SequenceDictionary(pipe) {} + virtual ~TaggerDictionary() {} void Clear() { - // Don't clear token_dictionary, since this class does not own it. 
- tag_alphabet_.clear(); + SequenceDictionary::Clear(); word_tags_.clear(); } void Save(FILE *fs) { - if (0 > tag_alphabet_.Save(fs)) CHECK(false); + SequenceDictionary::Save(fs); bool success; int length = unknown_word_tags_.size(); success = WriteInteger(fs, length); @@ -65,7 +60,7 @@ class SequenceDictionary : public Dictionary { } void Load(FILE *fs) { - if (0 > tag_alphabet_.Load(fs)) CHECK(false); + SequenceDictionary::Load(fs); bool success; int length; success = ReadInteger(fs, &length); @@ -91,41 +86,10 @@ class SequenceDictionary : public Dictionary { word_tags_[i][j] = tag; } } - - tag_alphabet_.BuildNames(); } - void AllowGrowth() { token_dictionary_->AllowGrowth(); } - void StopGrowth() { token_dictionary_->StopGrowth(); } - void CreateTagDictionary(SequenceReader *reader); - void BuildTagNames() { - tag_alphabet_.BuildNames(); - } - - const string &GetTagName(int tag) const { - return tag_alphabet_.GetName(tag); - } - - int GetBigramLabel(int left_tag, int tag) { - CHECK_GE(left_tag, -1); - CHECK_GE(tag, -1); - //return (left_tag * tag_alphabet_.size() + tag); - return ((1 + left_tag) * (1 + tag_alphabet_.size()) + (1 + tag)); - } - - int GetTrigramLabel(int left_left_tag, int left_tag, int tag) { - CHECK_GE(left_left_tag, -1); - CHECK_GE(left_tag, -1); - CHECK_GE(tag, -1); - //return (left_tag * left_tag * tag_alphabet_.size() + - // left_tag * tag_alphabet_.size() + tag); - return ((1 + left_left_tag) * (1 + tag_alphabet_.size()) * - (1 + tag_alphabet_.size()) + - (1 + left_tag) * (1 + tag_alphabet_.size()) + (1 + tag)); - } - const vector &GetWordTags(int word) { // return word_tags_[word]; // TODO: Not sure is this should be done here... 
@@ -138,19 +102,9 @@ class SequenceDictionary : public Dictionary { } } - TokenDictionary *GetTokenDictionary() const { return token_dictionary_; } - void SetTokenDictionary(TokenDictionary *token_dictionary) { - token_dictionary_ = token_dictionary; - } - - const Alphabet &GetTagAlphabet() const { return tag_alphabet_; }; - protected: - Pipe *pipe_; - TokenDictionary *token_dictionary_; - Alphabet tag_alphabet_; vector > word_tags_; vector unknown_word_tags_; }; -#endif /* SEQUENCEDICTIONARY_H_ */ +#endif /* TAGGERDICTIONARY_H_ */ diff --git a/src/tagger/SequenceFeatureTemplates.h b/src/tagger/TaggerFeatureTemplates.h similarity index 82% rename from src/tagger/SequenceFeatureTemplates.h rename to src/tagger/TaggerFeatureTemplates.h index d33f37d..229288e 100644 --- a/src/tagger/SequenceFeatureTemplates.h +++ b/src/tagger/TaggerFeatureTemplates.h @@ -16,10 +16,10 @@ // You should have received a copy of the GNU Lesser General Public License // along with TurboParser 2.1. If not, see . -#ifndef SEQUENCEFEATURETEMPLATES_H_ -#define SEQUENCEFEATURETEMPLATES_H_ +#ifndef TAGGERFEATURETEMPLATES_H_ +#define TAGGERFEATURETEMPLATES_H_ -struct SequenceFeatureTemplateParts { +struct TaggerFeatureTemplateParts { enum types { UNIGRAM = 0, BIGRAM, @@ -27,9 +27,9 @@ struct SequenceFeatureTemplateParts { }; }; -struct SequenceFeatureTemplateUnigram { +struct TaggerFeatureTemplateUnigram { enum types { - BIAS = 0, /* bias */ + BIAS = 0, /* bias */ W, /* word */ pW, /* word on the left */ nW, /* word on the right */ @@ -42,16 +42,16 @@ struct SequenceFeatureTemplateUnigram { }; }; -struct SequenceFeatureTemplateBigram { +struct TaggerFeatureTemplateBigram { enum types { BIAS = 0, /* bias */ }; }; -struct SequenceFeatureTemplateTrigram { +struct TaggerFeatureTemplateTrigram { enum types { BIAS = 0, /* bias */ }; }; -#endif /* SEQUENCEFEATURETEMPLATES_H_ */ +#endif /* TAGGERFEATURETEMPLATES_H_ */ diff --git a/src/tagger/SequenceFeatures.cpp b/src/tagger/TaggerFeatures.cpp similarity 
index 66% rename from src/tagger/SequenceFeatures.cpp rename to src/tagger/TaggerFeatures.cpp index cd1efd6..4360297 100644 --- a/src/tagger/SequenceFeatures.cpp +++ b/src/tagger/TaggerFeatures.cpp @@ -16,13 +16,13 @@ // You should have received a copy of the GNU Lesser General Public License // along with TurboParser 2.1. If not, see . -#include "SequencePipe.h" -#include "SequenceFeatures.h" +#include "TaggerPipe.h" +#include "TaggerFeatures.h" #include "SequencePart.h" -#include "SequenceFeatureTemplates.h" +#include "TaggerFeatureTemplates.h" -void SequenceFeatures::AddUnigramFeatures(SequenceInstanceNumeric *sentence, - int position) { +void TaggerFeatures::AddUnigramFeatures(SequenceInstanceNumeric *sentence, + int position) { CHECK(!input_features_unigrams_[position]); BinaryFeatures *features = new BinaryFeatures; input_features_unigrams_[position] = features; @@ -67,74 +67,74 @@ void SequenceFeatures::AddUnigramFeatures(SequenceInstanceNumeric *sentence, uint64_t fkey; uint8_t flags = 0x0; - flags |= SequenceFeatureTemplateParts::UNIGRAM; + flags |= TaggerFeatureTemplateParts::UNIGRAM; // Maximum is 255 feature templates. - CHECK_LT(SequenceFeatureTemplateUnigram::COUNT, 256); + CHECK_LT(TaggerFeatureTemplateUnigram::COUNT, 256); // Bias feature. - fkey = encoder_.CreateFKey_NONE(SequenceFeatureTemplateUnigram::BIAS, flags); + fkey = encoder_.CreateFKey_NONE(TaggerFeatureTemplateUnigram::BIAS, flags); AddFeature(fkey, features); // Lexical features. 
- fkey = encoder_.CreateFKey_W(SequenceFeatureTemplateUnigram::W, flags, WID); + fkey = encoder_.CreateFKey_W(TaggerFeatureTemplateUnigram::W, flags, WID); AddFeature(fkey, features); - fkey = encoder_.CreateFKey_W(SequenceFeatureTemplateUnigram::pW, flags, pWID); + fkey = encoder_.CreateFKey_W(TaggerFeatureTemplateUnigram::pW, flags, pWID); AddFeature(fkey, features); - fkey = encoder_.CreateFKey_W(SequenceFeatureTemplateUnigram::nW, flags, nWID); + fkey = encoder_.CreateFKey_W(TaggerFeatureTemplateUnigram::nW, flags, nWID); AddFeature(fkey, features); - fkey = encoder_.CreateFKey_W(SequenceFeatureTemplateUnigram::ppW, flags, ppWID); + fkey = encoder_.CreateFKey_W(TaggerFeatureTemplateUnigram::ppW, flags, ppWID); AddFeature(fkey, features); - fkey = encoder_.CreateFKey_W(SequenceFeatureTemplateUnigram::nnW, flags, nnWID); + fkey = encoder_.CreateFKey_W(TaggerFeatureTemplateUnigram::nnW, flags, nnWID); AddFeature(fkey, features); // Prefix/Suffix features. for (int l = 0; l < AID.size(); ++l) { uint8_t flag_prefix_length = l; - fkey = encoder_.CreateFKey_WP(SequenceFeatureTemplateUnigram::A, flags, AID[l], flag_prefix_length); + fkey = encoder_.CreateFKey_WP(TaggerFeatureTemplateUnigram::A, flags, AID[l], flag_prefix_length); AddFeature(fkey, features); } for (int l = 0; l < ZID.size(); ++l) { uint8_t flag_suffix_length = l; - fkey = encoder_.CreateFKey_WP(SequenceFeatureTemplateUnigram::Z, flags, ZID[l], flag_suffix_length); + fkey = encoder_.CreateFKey_WP(TaggerFeatureTemplateUnigram::Z, flags, ZID[l], flag_suffix_length); AddFeature(fkey, features); } // Several flags. 
- fkey = encoder_.CreateFKey_P(SequenceFeatureTemplateUnigram::FLAG, flags, flag_digit); + fkey = encoder_.CreateFKey_P(TaggerFeatureTemplateUnigram::FLAG, flags, flag_digit); AddFeature(fkey, features); - fkey = encoder_.CreateFKey_P(SequenceFeatureTemplateUnigram::FLAG, flags, flag_upper); + fkey = encoder_.CreateFKey_P(TaggerFeatureTemplateUnigram::FLAG, flags, flag_upper); AddFeature(fkey, features); - fkey = encoder_.CreateFKey_P(SequenceFeatureTemplateUnigram::FLAG, flags, flag_hyphen); + fkey = encoder_.CreateFKey_P(TaggerFeatureTemplateUnigram::FLAG, flags, flag_hyphen); AddFeature(fkey, features); } -void SequenceFeatures::AddBigramFeatures(SequenceInstanceNumeric *sentence, - int position) { +void TaggerFeatures::AddBigramFeatures(SequenceInstanceNumeric *sentence, + int position) { CHECK(!input_features_bigrams_[position]) << position << " " << sentence->size(); BinaryFeatures *features = new BinaryFeatures; input_features_bigrams_[position] = features; uint64_t fkey; uint8_t flags = 0x0; - flags |= SequenceFeatureTemplateParts::BIGRAM; + flags |= TaggerFeatureTemplateParts::BIGRAM; // Bias feature. - fkey = encoder_.CreateFKey_NONE(SequenceFeatureTemplateBigram::BIAS, flags); + fkey = encoder_.CreateFKey_NONE(TaggerFeatureTemplateBigram::BIAS, flags); AddFeature(fkey, features); } -void SequenceFeatures::AddTrigramFeatures(SequenceInstanceNumeric *sentence, - int position) { +void TaggerFeatures::AddTrigramFeatures(SequenceInstanceNumeric *sentence, + int position) { CHECK(!input_features_trigrams_[position]) << position << " " << sentence->size(); BinaryFeatures *features = new BinaryFeatures; input_features_trigrams_[position] = features; uint64_t fkey; uint8_t flags = 0x0; - flags |= SequenceFeatureTemplateParts::TRIGRAM; + flags |= TaggerFeatureTemplateParts::TRIGRAM; // Bias feature. 
- fkey = encoder_.CreateFKey_NONE(SequenceFeatureTemplateTrigram::BIAS, flags); + fkey = encoder_.CreateFKey_NONE(TaggerFeatureTemplateTrigram::BIAS, flags); AddFeature(fkey, features); } diff --git a/src/tagger/TaggerFeatures.h b/src/tagger/TaggerFeatures.h new file mode 100644 index 0000000..968d419 --- /dev/null +++ b/src/tagger/TaggerFeatures.h @@ -0,0 +1,49 @@ +// Copyright (c) 2012-2013 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.1. +// +// TurboParser 2.1 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.1 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.1. If not, see . + +#ifndef TAGGERFEATURES_H_ +#define TAGGERFEATURES_H_ + +#include "SequenceFeatures.h" +#include "FeatureEncoder.h" + +class TaggerFeatures: public SequenceFeatures { + public: + TaggerFeatures(Pipe* pipe) : SequenceFeatures(pipe) {} + virtual ~TaggerFeatures() {}; + + public: + void AddUnigramFeatures(SequenceInstanceNumeric *sentence, + int position); + + void AddBigramFeatures(SequenceInstanceNumeric *sentence, + int position); + + void AddTrigramFeatures(SequenceInstanceNumeric *sentence, + int position); + + protected: + void AddFeature(uint64_t fkey, BinaryFeatures* features) { + features->push_back(fkey); + } + + protected: + FeatureEncoder encoder_; // Encoder that converts features into a codeword. 
+}; + +#endif /* TAGGERFEATURES_H_ */ diff --git a/src/tagger/SequenceOptions.cpp b/src/tagger/TaggerOptions.cpp similarity index 81% rename from src/tagger/SequenceOptions.cpp rename to src/tagger/TaggerOptions.cpp index 2f2cbe2..15e21a5 100644 --- a/src/tagger/SequenceOptions.cpp +++ b/src/tagger/TaggerOptions.cpp @@ -16,7 +16,7 @@ // You should have received a copy of the GNU Lesser General Public License // along with TurboParser 2.1. If not, see . -#include "SequenceOptions.h" +#include "TaggerOptions.h" #include "SerializationUtils.h" #include @@ -28,8 +28,8 @@ DEFINE_string(tagger_file_format, "conll", "the format used in CONLL-X, and ""text"" for tokenized" "sentences (one per line, with tokens separated " "by white-spaces."); -DEFINE_int32(tagger_model_type, 2, - "Model type. 1 is a bigram model, 2 is a trigram model."); +//DEFINE_int32(tagger_model_type, 2, +// "Model type. 1 is a bigram model, 2 is a trigram model."); DEFINE_bool(tagger_large_feature_set, false, "True for using a large feature set. Taggers are usually more " "accurate but slower and have a larger memory footprint."); @@ -40,12 +40,12 @@ DEFINE_string(file_unknown_word_tags, "", "to out-of-vocabulary words."); // Save current option flags to the model file. -void SequenceOptions::Save(FILE* fs) { - Options::Save(fs); +void TaggerOptions::Save(FILE* fs) { + SequenceOptions::Save(fs); bool success; - success = WriteInteger(fs, model_type_); - CHECK(success); + //success = WriteInteger(fs, model_type_); + //CHECK(success); success = WriteBool(fs, large_feature_set_); CHECK(success); success = WriteBool(fs, prune_tags_); @@ -57,13 +57,13 @@ void SequenceOptions::Save(FILE* fs) { // Load current option flags to the model file. // Note: this will override the user-specified flags. 
-void SequenceOptions::Load(FILE* fs) { - Options::Load(fs); +void TaggerOptions::Load(FILE* fs) { + SequenceOptions::Load(fs); bool success; - success = ReadInteger(fs, &FLAGS_tagger_model_type); - CHECK(success); - LOG(INFO) << "Setting --tagger_model_type=" << FLAGS_tagger_model_type; + //success = ReadInteger(fs, &FLAGS_tagger_model_type); + //CHECK(success); + //LOG(INFO) << "Setting --tagger_model_type=" << FLAGS_tagger_model_type; success = ReadBool(fs, &FLAGS_tagger_large_feature_set); CHECK(success); LOG(INFO) << "Setting --tagger_large_feature_set=" @@ -78,11 +78,11 @@ void SequenceOptions::Load(FILE* fs) { Initialize(); } -void SequenceOptions::Initialize() { - Options::Initialize(); +void TaggerOptions::Initialize() { + SequenceOptions::Initialize(); file_format_ = FLAGS_tagger_file_format; - model_type_ = FLAGS_tagger_model_type; + //model_type_ = FLAGS_tagger_model_type; large_feature_set_ = FLAGS_tagger_large_feature_set; prune_tags_ = FLAGS_tagger_prune_tags; file_unknown_word_tags_ = FLAGS_file_unknown_word_tags; diff --git a/src/tagger/TaggerOptions.h b/src/tagger/TaggerOptions.h new file mode 100644 index 0000000..423781c --- /dev/null +++ b/src/tagger/TaggerOptions.h @@ -0,0 +1,50 @@ +// Copyright (c) 2012-2013 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.1. +// +// TurboParser 2.1 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.1 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.1. If not, see . 
+ +#ifndef TAGGER_OPTIONS_H_ +#define TAGGER_OPTIONS_H_ + +#include "SequenceOptions.h" + +class TaggerOptions : public SequenceOptions { + public: + TaggerOptions() {}; + virtual ~TaggerOptions() {}; + + // Serialization functions. + void Load(FILE* fs); + void Save(FILE* fs); + + // Initialization: set options based on the flags. + void Initialize(); + + // Get option flags. + bool prune_tags() { return prune_tags_; } + bool large_feature_set() { return large_feature_set_; } + const string &GetUnknownWordTagsFilePath() { + return file_unknown_word_tags_; + } + + protected: + bool prune_tags_; + string file_format_; + bool large_feature_set_; + string file_unknown_word_tags_; +}; + +#endif // TAGGER_OPTIONS_H_ diff --git a/src/tagger/TaggerPipe.cpp b/src/tagger/TaggerPipe.cpp new file mode 100644 index 0000000..cb3848e --- /dev/null +++ b/src/tagger/TaggerPipe.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2012-2013 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.1. +// +// TurboParser 2.1 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.1 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.1. If not, see . 
+ +#include "TaggerPipe.h" +#include +#include +#include +#ifdef _WIN32 +#include +#else +#include +#endif + diff --git a/src/tagger/TaggerPipe.h b/src/tagger/TaggerPipe.h new file mode 100644 index 0000000..423cb3c --- /dev/null +++ b/src/tagger/TaggerPipe.h @@ -0,0 +1,164 @@ +// Copyright (c) 2012-2013 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.1. +// +// TurboParser 2.1 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.1 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// Along with TurboParser 2.1. If not, see . 
+ +#ifndef TAGGERPIPE_H_ +#define TAGGERPIPE_H_ + +#include "SequencePipe.h" +#include "TaggerOptions.h" +//#include "SequenceReader.h" +#include "TaggerDictionary.h" +//#include "TokenDictionary.h" +//#include "SequenceInstanceNumeric.h" +//#include "SequenceWriter.h" +//#include "SequencePart.h" +#include "TaggerFeatures.h" +//#include "SequenceDecoder.h" + +class TaggerPipe : public SequencePipe { + public: + TaggerPipe(Options* options) : SequencePipe(options) {} + virtual ~TaggerPipe() {} + + //SequenceReader *GetSequenceReader() { + // return static_cast(reader_); + //}; + TaggerDictionary *GetTaggerDictionary() { + return static_cast(dictionary_); + }; + TaggerOptions *GetTaggerOptions() { + return static_cast(options_); + }; + + protected: + void CreateDictionary() { + dictionary_ = new TaggerDictionary(this); + GetSequenceDictionary()->SetTokenDictionary(token_dictionary_); + } + //void CreateReader() { reader_ = new SequenceReader; } + //void CreateWriter() { writer_ = new SequenceWriter; } + //void CreateDecoder() { decoder_ = new SequenceDecoder(this); }; + //Parts *CreateParts() { return new SequenceParts; }; + Features *CreateFeatures() { return new TaggerFeatures(this); }; + + //void PreprocessData(); + + //Instance *GetFormattedInstance(Instance *instance) { + // SequenceInstanceNumeric *instance_numeric = + // new SequenceInstanceNumeric; + // instance_numeric->Initialize(*GetSequenceDictionary(), + // static_cast(instance)); + // return instance_numeric; + //} + + protected: + //void SaveModel(FILE* fs); + //void LoadModel(FILE* fs); + + void GetAllowedTags(Instance *instance, int i, vector *allowed_tags) { + // Make word-tag dictionary pruning. 
+ allowed_tags->clear(); + bool prune_tags = GetTaggerOptions()->prune_tags(); + if (!prune_tags) return; + + SequenceInstanceNumeric *sentence = + static_cast(instance); + TaggerDictionary *tagger_dictionary = GetTaggerDictionary(); + + int word_id = sentence->GetFormId(i); + *allowed_tags = tagger_dictionary->GetWordTags(word_id); + } + + //void MakeParts(Instance *instance, Parts *parts, + // vector *gold_outputs); + //void MakeUnigramParts(Instance *instance, Parts *parts, + // vector *gold_outputs); + //void MakeBigramParts(Instance *instance, Parts *parts, + // vector *gold_outputs); + //void MakeTrigramParts(Instance *instance, Parts *parts, + // vector *gold_outputs); + + //void MakeSelectedFeatures(Instance *instance, Parts *parts, + // const vector &selected_parts, Features *features); + + //void ComputeScores(Instance *instance, Parts *parts, Features *features, + // vector *scores); + + //void MakeFeatureDifference(Parts *parts, + // Features *features, + // const vector &gold_output, + // const vector &predicted_output, + // FeatureVector *difference); + + //void MakeGradientStep(Parts *parts, + // Features *features, + // double eta, + // int iteration, + // const vector &gold_output, + // const vector &predicted_output); + + //void LabelInstance(Parts *parts, const vector &output, + // Instance *instance); + + //virtual void BeginEvaluation() { + // num_tag_mistakes_ = 0; + // num_tokens_ = 0; + // gettimeofday(&start_clock_, NULL); + //} + //virtual void EvaluateInstance(Instance *instance, + // Instance *output_instance, + // Parts *parts, + // const vector &gold_outputs, + // const vector &predicted_outputs) { + // SequenceInstance *sequence_instance = + // static_cast(instance); + // SequenceParts *sequence_parts = static_cast(parts); + // for (int i = 0; i < sequence_instance->size(); ++i) { + // const vector& unigrams = sequence_parts->FindUnigramParts(i); + // for (int k = 0; k < unigrams.size(); ++k) { + // int r = unigrams[k]; + // if 
(!NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) { + // ++num_tag_mistakes_; + // break; + // } + // } + // ++num_tokens_; + // } + //} + //virtual void EndEvaluation() { + // LOG(INFO) << "Tagging accuracy: " << + // static_cast(num_tokens_ - num_tag_mistakes_) / + // static_cast(num_tokens_); + // timeval end_clock; + // gettimeofday(&end_clock, NULL); + // double num_seconds = + // static_cast(diff_ms(end_clock,start_clock_)) / 1000.0; + // double tokens_per_second = static_cast(num_tokens_) / num_seconds; + // LOG(INFO) << "Tagging speed: " + // << tokens_per_second << " tokens per second."; + //} + + protected: + //TokenDictionary *token_dictionary_; + //int num_tag_mistakes_; + //int num_tokens_; + //timeval start_clock_; +}; + +#endif /* TAGGERPIPE_H_ */ + diff --git a/src/tagger/TurboTagger.cpp b/src/tagger/TurboTagger.cpp index d8db1cd..3eaaef0 100644 --- a/src/tagger/TurboTagger.cpp +++ b/src/tagger/TurboTagger.cpp @@ -8,7 +8,7 @@ #include #include #include "Utils.h" -#include "SequencePipe.h" +#include "TaggerPipe.h" //#include "StringUtils.h" using namespace std; @@ -43,10 +43,10 @@ void TrainTagger() { timeval start, end; gettimeofday(&start, NULL); - SequenceOptions *options = new SequenceOptions; + TaggerOptions *options = new TaggerOptions; options->Initialize(); - SequencePipe *pipe = new SequencePipe(options); + TaggerPipe *pipe = new TaggerPipe(options); pipe->Initialize(); pipe->Train(); pipe->SaveModelFile(); @@ -66,10 +66,10 @@ void TestTagger() { timeval start, end; gettimeofday(&start, NULL); - SequenceOptions *options = new SequenceOptions; + TaggerOptions *options = new TaggerOptions; options->Initialize(); - SequencePipe *pipe = new SequencePipe(options); + TaggerPipe *pipe = new TaggerPipe(options); pipe->Initialize(); pipe->LoadModelFile(); pipe->Run();