Skip to content

Commit

Permalink
ENH Load a gazetteer file for NER.
Browse files Browse the repository at this point in the history
  • Loading branch information
andre-martins committed Oct 11, 2014
1 parent efa8948 commit 38024d7
Show file tree
Hide file tree
Showing 10 changed files with 1,964 additions and 1,229 deletions.
326 changes: 191 additions & 135 deletions Makefile.in

Large diffs are not rendered by default.

686 changes: 432 additions & 254 deletions aclocal.m4

Large diffs are not rendered by default.

614 changes: 397 additions & 217 deletions configure

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions scripts_ner/train_test_entity_recognizer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ test=true
model_type=2 # Second-order model (trigrams).
form_cutoff=0 #1 # Word cutoff. Only words that occur more than this many times won't be considered unknown.
tagging_scheme=bilou # bio
file_gazetteer= # Empty gazetteer file by default.
suffix=entity_recognizer

# Set path folders.
Expand All @@ -36,6 +37,7 @@ if [ "$language" == "english" ] || [ "$language" == "spanish" ]
then
files_test[0]=${path_data}/${language}_test.conll.ner
files_test[1]=${path_data}/${language}_dev.conll.ner
file_gazetteer=${path_data}/${language}.list
else
files_test[0]=${path_data}/${language}_test.conll.ner
fi
Expand Down Expand Up @@ -65,6 +67,7 @@ then
--sequence_model_type=${model_type} \
--form_cutoff=${form_cutoff} \
--entity_tagging_scheme=${tagging_scheme} \
--entity_file_gazetteer=${file_gazetteer} \
--logtostderr
fi

Expand Down
2 changes: 1 addition & 1 deletion scripts_srl/evaluator
Submodule evaluator updated from 5a9318 to b43ebd
118 changes: 56 additions & 62 deletions src/entity_recognizer/EntityDictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,74 +121,68 @@ void EntityDictionary::CreateTagDictionary(SequenceReader *reader) {

LOG(INFO) << "Total allowed bigrams: " << num_allowed_bigrams;

#if 0
LOG(INFO) << "Creating word-tag dictionary...";
bool form_case_sensitive = FLAGS_form_case_sensitive;

// Go through the corpus and build the existing tags for each word.
word_tags_.clear();
word_tags_.resize(token_dictionary_->GetNumForms());

reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
SequenceInstance *instance =
static_cast<SequenceInstance*>(reader->GetNext());
while (instance != NULL) {
int instance_length = instance->size();
for (int i = 0; i < instance_length; ++i) {
int id;
string form = instance->GetForm(i);
if (!form_case_sensitive) {
transform(form.begin(), form.end(), form.begin(), ::tolower);
}
int word_id = token_dictionary_->GetFormId(form);
//CHECK_GE(word_id, 0);

id = tag_alphabet_.Lookup(instance->GetTag(i));
CHECK_GE(id, 0);

// Insert new tag in the set of word tags, if it is not there
// already. NOTE: this is inefficient, maybe we should be using a
// different data structure.
if (word_id >= 0) {
vector<int> &tags = word_tags_[word_id];
int j;
for (j = 0; j < tags.size(); ++j) {
if (tags[j] == id) break;
}
if (j == tags.size()) tags.push_back(id);
}
}
delete instance;
instance = static_cast<SequenceInstance*>(reader->GetNext());
}
reader->Close();

// If there is a list of possible tags for the unknown words, load it.
TaggerOptions *options =
static_cast<TaggerOptions*>(pipe_->GetOptions());
if (options->GetUnknownWordTagsFilePath().size() == 0) {
for (int i = 0; i < tag_alphabet_.size(); ++i) {
unknown_word_tags_.push_back(i);
}
} else {
LOG(INFO) << "Loading file with unknown word tags...";
ReadGazetteerFiles();
}

void EntityDictionary::ReadGazetteerFiles() {
EntityOptions *options =
static_cast<EntityOptions*>(pipe_->GetOptions());

gazetteer_word_alphabet_.AllowGrowth();
gazetteer_entity_tag_alphabet_.AllowGrowth();

if (options->file_gazetteer() != "") {
LOG(INFO) << "Loading gazetteer file "
<< options->file_gazetteer() << "...";
std::ifstream is;
is.open(options->GetUnknownWordTagsFilePath().c_str(), ifstream::in);
is.open(options->file_gazetteer().c_str(), ifstream::in);
CHECK(is.good()) << "Could not open "
<< options->GetUnknownWordTagsFilePath() << ".";
vector<vector<string> > sentence_fields;
string line;
<< options->file_gazetteer() << ".";
std::string line;
if (is.is_open()) {
while (!is.eof()) {
getline(is, line);
if (line.size() == 0) break;
int tagid = tag_alphabet_.Lookup(line);
CHECK(tagid >= 0) << "Tag " << line << " does not exist.";
unknown_word_tags_.push_back(tagid);
LOG(INFO) << "Unknown word tag: " << line;
if (line == "") continue; // Ignore blank lines.
std::vector<std::string> fields;
StringSplit(line, " \t", &fields); // Break on tabs or spaces.
if (fields.size() < 2) continue;
const std::string &entity_type = fields[0];
int entity_type_begin_id =
gazetteer_entity_tag_alphabet_.Insert("B-" + entity_type);
int entity_type_inside_id =
gazetteer_entity_tag_alphabet_.Insert("I-" + entity_type);
int entity_type_last_id =
gazetteer_entity_tag_alphabet_.Insert("L-" + entity_type);
int entity_type_unique_id =
gazetteer_entity_tag_alphabet_.Insert("U-" + entity_type);
for (int k = 1; k < fields.size(); ++k) {
const std::string &word = fields[k];
int word_id = gazetteer_word_alphabet_.Insert(word);
if (gazetteer_word_entity_tags_.size() <= word_id) {
gazetteer_word_entity_tags_.resize(word_id+1);
}
if (fields.size() == 2) {
gazetteer_word_entity_tags_[word_id].
push_back(entity_type_unique_id);
} else if (k == 1) {
gazetteer_word_entity_tags_[word_id].
push_back(entity_type_begin_id);
} else if (k == fields.size() - 1) {
gazetteer_word_entity_tags_[word_id].
push_back(entity_type_last_id);
} else {
gazetteer_word_entity_tags_[word_id].
push_back(entity_type_inside_id);
}
}
}
}
}
LOG(INFO) << "Number of unknown word tags: " << unknown_word_tags_.size();
#endif

gazetteer_word_alphabet_.StopGrowth();
gazetteer_entity_tag_alphabet_.StopGrowth();
LOG(INFO) << "Number of gazetteer words: "
<< gazetteer_word_alphabet_.size();
LOG(INFO) << "Number of gazetteer entity tags: "
<< gazetteer_entity_tag_alphabet_.size();
}
5 changes: 5 additions & 0 deletions src/entity_recognizer/EntityDictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ class EntityDictionary : public SequenceDictionary {

void CreateTagDictionary(SequenceReader *reader);

void ReadGazetteerFiles();

bool IsAllowedBigram(int left_tag, int tag) {
CHECK_GE(left_tag, -1);
CHECK_GE(tag, -1);
Expand All @@ -136,6 +138,9 @@ class EntityDictionary : public SequenceDictionary {

protected:
std::vector<std::vector<bool> > allowed_bigrams_;
Alphabet gazetteer_word_alphabet_;
Alphabet gazetteer_entity_tag_alphabet_;
std::vector<std::vector<int> > gazetteer_word_entity_tags_;
//vector<vector<int> > word_tags_;
//vector<int> unknown_word_tags_;
};
Expand Down
Loading

0 comments on commit 38024d7

Please sign in to comment.