ENH Load a gazetteer file for NER.

Cocophotos · Oct 11, 2014 · 38024d7 · 38024d7
1 parent efa8948
commit 38024d7
Show file tree

Hide file tree

Showing 10 changed files with 1,964 additions and 1,229 deletions.
diff --git a/Makefile.in b/Makefile.in
diff --git a/aclocal.m4 b/aclocal.m4
diff --git a/configure b/configure
diff --git a/scripts_ner/train_test_entity_recognizer.sh b/scripts_ner/train_test_entity_recognizer.sh
@@ -14,6 +14,7 @@ test=true
 model_type=2 # Second-order model (trigrams).
 form_cutoff=0 #1 # Word cutoff. Only words which occur more than these times won't be considered unknown.
 tagging_scheme=bilou # bio
+file_gazetteer= # No gazetteer file by default (empty path).
 suffix=entity_recognizer
 
 # Set path folders.
@@ -36,6 +37,7 @@ if [ "$language" == "english" ] || [ "$language" == "spanish" ]
 then
     files_test[0]=${path_data}/${language}_test.conll.ner
     files_test[1]=${path_data}/${language}_dev.conll.ner
+    file_gazetteer=${path_data}/${language}.list
 else
     files_test[0]=${path_data}/${language}_test.conll.ner
 fi
@@ -65,6 +67,7 @@ then
         --sequence_model_type=${model_type} \
         --form_cutoff=${form_cutoff} \
         --entity_tagging_scheme=${tagging_scheme} \
+        --entity_file_gazetteer=${file_gazetteer} \
         --logtostderr
 fi
 

diff --git a/scripts_srl/evaluator b/scripts_srl/evaluator
diff --git a/src/entity_recognizer/EntityDictionary.cpp b/src/entity_recognizer/EntityDictionary.cpp
@@ -121,74 +121,68 @@ void EntityDictionary::CreateTagDictionary(SequenceReader *reader) {
 
   LOG(INFO) << "Total allowed bigrams: " << num_allowed_bigrams;
 
-#if 0
-  LOG(INFO) << "Creating word-tag dictionary...";
-  bool form_case_sensitive = FLAGS_form_case_sensitive;
-
-  // Go through the corpus and build the existing tags for each word.
-  word_tags_.clear();
-  word_tags_.resize(token_dictionary_->GetNumForms());
-
-  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
-  SequenceInstance *instance =
-    static_cast<SequenceInstance*>(reader->GetNext());
-  while (instance != NULL) {
-    int instance_length = instance->size();
-    for (int i = 0; i < instance_length; ++i) {
-      int id;
-      string form = instance->GetForm(i);
-      if (!form_case_sensitive) {
-        transform(form.begin(), form.end(), form.begin(), ::tolower);
-      }
-      int word_id = token_dictionary_->GetFormId(form);
-      //CHECK_GE(word_id, 0);
-
-      id = tag_alphabet_.Lookup(instance->GetTag(i));
-      CHECK_GE(id, 0);
-
-      // Insert new tag in the set of word tags, if it is not there
-      // already. NOTE: this is inefficient, maybe we should be using a
-      // different data structure.
-      if (word_id >= 0) {
-        vector<int> &tags = word_tags_[word_id];
-        int j;
-        for (j = 0; j < tags.size(); ++j) {
-          if (tags[j] == id) break;
-        }
-        if (j == tags.size()) tags.push_back(id);
-      }
-    }
-    delete instance;
-    instance = static_cast<SequenceInstance*>(reader->GetNext());
-  }
-  reader->Close();
-
-  // If there is a list of possible tags for the unknown words, load it.
-  TaggerOptions *options =
-    static_cast<TaggerOptions*>(pipe_->GetOptions());
-  if (options->GetUnknownWordTagsFilePath().size() == 0) {
-    for (int i = 0; i < tag_alphabet_.size(); ++i) {
-      unknown_word_tags_.push_back(i);
-    }
-  } else {
-    LOG(INFO) << "Loading file with unknown word tags...";
+  ReadGazetteerFiles();
+}
+
+void EntityDictionary::ReadGazetteerFiles() {
+  EntityOptions *options =
+    static_cast<EntityOptions*>(pipe_->GetOptions());
+
+  gazetteer_word_alphabet_.AllowGrowth();
+  gazetteer_entity_tag_alphabet_.AllowGrowth();
+
+  if (options->file_gazetteer() != "") {
+    LOG(INFO) << "Loading gazetteer file "
+              << options->file_gazetteer() << "...";
     std::ifstream is;
-    is.open(options->GetUnknownWordTagsFilePath().c_str(), ifstream::in);
+    is.open(options->file_gazetteer().c_str(), ifstream::in);
     CHECK(is.good()) << "Could not open "
-                     << options->GetUnknownWordTagsFilePath() << ".";
-    vector<vector<string> > sentence_fields;
-    string line;
+                     << options->file_gazetteer() << ".";
+    std::string line;
     if (is.is_open()) {
       while (!is.eof()) {
         getline(is, line);
-        if (line.size() == 0) break;
-        int tagid = tag_alphabet_.Lookup(line);
-        CHECK(tagid >= 0) << "Tag " << line << " does not exist.";
-        unknown_word_tags_.push_back(tagid);
-        LOG(INFO) << "Unknown word tag: " << line;
+        if (line == "") continue; // Ignore blank lines.
+        std::vector<std::string> fields;
+        StringSplit(line, " \t", &fields); // Break on tabs or spaces.
+        if (fields.size() < 2) continue;
+        const std::string &entity_type = fields[0];
+        int entity_type_begin_id =
+          gazetteer_entity_tag_alphabet_.Insert("B-" + entity_type);
+        int entity_type_inside_id =
+          gazetteer_entity_tag_alphabet_.Insert("I-" + entity_type);
+        int entity_type_last_id =
+          gazetteer_entity_tag_alphabet_.Insert("L-" + entity_type);
+        int entity_type_unique_id =
+          gazetteer_entity_tag_alphabet_.Insert("U-" + entity_type);
+        for (int k = 1; k < fields.size(); ++k) {
+          const std::string &word = fields[k];
+          int word_id = gazetteer_word_alphabet_.Insert(word);
+          if (gazetteer_word_entity_tags_.size() <= word_id) {
+            gazetteer_word_entity_tags_.resize(word_id+1);
+          }
+          if (fields.size() == 2) {
+            gazetteer_word_entity_tags_[word_id].
+              push_back(entity_type_unique_id);
+          } else if (k == 1) {
+            gazetteer_word_entity_tags_[word_id].
+              push_back(entity_type_begin_id);
+          } else if (k == fields.size() - 1) {
+            gazetteer_word_entity_tags_[word_id].
+              push_back(entity_type_last_id);
+          } else {
+            gazetteer_word_entity_tags_[word_id].
+              push_back(entity_type_inside_id);
+          }
+        }
       }
     }
   }
-  LOG(INFO) << "Number of unknown word tags: " << unknown_word_tags_.size();
-#endif
+
+  gazetteer_word_alphabet_.StopGrowth();
+  gazetteer_entity_tag_alphabet_.StopGrowth();
+  LOG(INFO) << "Number of gazetteer words: "
+            << gazetteer_word_alphabet_.size();
+  LOG(INFO) << "Number of gazetteer entity tags: "
+            << gazetteer_entity_tag_alphabet_.size();
 }
diff --git a/src/entity_recognizer/EntityDictionary.h b/src/entity_recognizer/EntityDictionary.h
@@ -128,6 +128,8 @@ class EntityDictionary : public SequenceDictionary {
 
   void CreateTagDictionary(SequenceReader *reader);
 
+  void ReadGazetteerFiles();
+
   bool IsAllowedBigram(int left_tag, int tag) {
     CHECK_GE(left_tag, -1);
     CHECK_GE(tag, -1);
@@ -136,6 +138,9 @@ class EntityDictionary : public SequenceDictionary {
 
  protected:
   std::vector<std::vector<bool> > allowed_bigrams_;
+  Alphabet gazetteer_word_alphabet_;
+  Alphabet gazetteer_entity_tag_alphabet_;
+  std::vector<std::vector<int> > gazetteer_word_entity_tags_;
   //vector<vector<int> > word_tags_;
   //vector<int> unknown_word_tags_;
 };