Skip to content

Commit

Permalink
FIX: Fixed a bug in computing the gazetteer information.
Browse files Browse the repository at this point in the history
  • Loading branch information
andre-martins committed Oct 12, 2014
1 parent eefbb32 commit 4f77016
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 15 deletions.
45 changes: 45 additions & 0 deletions scripts_ner/results.txt
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,48 @@ accuracy: 87.68%; precision: 89.35%; recall: 88.10%; FB1: 88.72
MISC: precision: 88.74%; recall: 83.73%; FB1: 86.16
ORG: precision: 87.46%; recall: 80.61%; FB1: 83.90
PER: precision: 88.27%; recall: 91.48%; FB1: 89.84

=======================================================================

BILOU with shapes and trigram features, constrained, with 1 gazetteer feature [at home].

English
-------
test:
accuracy: 86.60%; precision: 86.93%; recall: 87.02%; FB1: 86.98
LOC: precision: 88.58%; recall: 91.19%; FB1: 89.87
MISC: precision: 80.56%; recall: 77.92%; FB1: 79.22
ORG: precision: 82.92%; recall: 80.37%; FB1: 81.63
PER: precision: 91.75%; recall: 93.51%; FB1: 92.62

dev:
I1012 19:35:20.068308 8257 SequencePipe.h:150] Tagging speed: 9360.8 tokens per second.
accuracy: 87.74%; precision: 91.56%; recall: 91.06%; FB1: 91.31
LOC: precision: 93.75%; recall: 93.96%; FB1: 93.86
MISC: precision: 90.00%; recall: 84.92%; FB1: 87.39
ORG: precision: 86.98%; recall: 85.16%; FB1: 86.06
PER: precision: 93.32%; recall: 95.55%; FB1: 94.42


=======================================================================

BILOU with shapes and trigram features, constrained, with 5 gazetteer features [at home].

English
-------
test:
I1012 19:44:18.231323 8402 SequencePipe.h:150] Tagging speed: 9021.07 tokens per second.
accuracy: 86.62%; precision: 86.45%; recall: 86.07%; FB1: 86.26
LOC: precision: 88.30%; recall: 90.53%; FB1: 89.40
MISC: precision: 79.43%; recall: 75.93%; FB1: 77.64
ORG: precision: 81.67%; recall: 79.41%; FB1: 80.53
PER: precision: 92.13%; recall: 92.70%; FB1: 92.42

dev:
I1012 19:44:25.217227 8406 SequencePipe.h:150] Tagging speed: 8963.85 tokens per second.
accuracy: 87.67%; precision: 90.85%; recall: 90.04%; FB1: 90.44
LOC: precision: 93.88%; recall: 93.52%; FB1: 93.70
MISC: precision: 88.26%; recall: 82.32%; FB1: 85.19
ORG: precision: 85.47%; recall: 83.37%; FB1: 84.41
PER: precision: 92.81%; recall: 95.28%; FB1: 94.03

8 changes: 7 additions & 1 deletion scripts_ner/train_test_entity_recognizer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,13 @@ if [ "$language" == "english" ] || [ "$language" == "spanish" ]
then
files_test[0]=${path_data}/${language}_test.conll.ner
files_test[1]=${path_data}/${language}_dev.conll.ner
file_gazetteer=${path_data}/${language}.list
files_test[2]=${path_data}/${language}_train.conll.ner
file_gazetteer=${path_data}/${language}_all_gazetteers.txt

echo "Creating gazetteer file..."
python create_gazetteer_file.py ${path_data}/KnownLists $file_gazetteer
echo "Done."

else
files_test[0]=${path_data}/${language}_test.conll.ner
fi
Expand Down
2 changes: 1 addition & 1 deletion scripts_srl/evaluator
Submodule evaluator updated from 5a9318 to b43ebd
66 changes: 54 additions & 12 deletions src/entity_recognizer/EntityDictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,13 @@ void EntityDictionary::ReadGazetteerFiles() {
LOG(INFO) << "Loading gazetteer file "
<< options->file_gazetteer() << "...";
std::ifstream is;
std::string line;

// Do a first pass just to count the words and create the
// dictionaries.
is.open(options->file_gazetteer().c_str(), ifstream::in);
CHECK(is.good()) << "Could not open "
<< options->file_gazetteer() << ".";
std::string line;
if (is.is_open()) {
while (!is.eof()) {
getline(is, line);
Expand All @@ -157,26 +160,64 @@ void EntityDictionary::ReadGazetteerFiles() {
gazetteer_entity_tag_alphabet_.Insert("U-" + entity_type);
for (int k = 1; k < fields.size(); ++k) {
const std::string &word = fields[k];
int word_id = gazetteer_word_alphabet_.Insert(word);
if (gazetteer_word_entity_tags_.size() <= word_id) {
gazetteer_word_entity_tags_.resize(word_id+1);
}
gazetteer_word_alphabet_.Insert(word);
}
}
}
is.close();

// Now do the second pass to actually fill in the data.
gazetteer_word_entity_tags_.clear();
gazetteer_word_entity_tags_.resize(gazetteer_word_alphabet_.size());
is.open(options->file_gazetteer().c_str(), ifstream::in);
CHECK(is.good()) << "Could not open "
<< options->file_gazetteer() << ".";
if (is.is_open()) {
while (!is.eof()) {
getline(is, line);
if (line == "") continue; // Ignore blank lines.
std::vector<std::string> fields;
StringSplit(line, " \t", &fields); // Break on tabs or spaces.
if (fields.size() < 2) continue;
const std::string &entity_type = fields[0];
int entity_type_begin_id =
gazetteer_entity_tag_alphabet_.Lookup("B-" + entity_type);
int entity_type_inside_id =
gazetteer_entity_tag_alphabet_.Lookup("I-" + entity_type);
int entity_type_last_id =
gazetteer_entity_tag_alphabet_.Lookup("L-" + entity_type);
int entity_type_unique_id =
gazetteer_entity_tag_alphabet_.Lookup("U-" + entity_type);
for (int k = 1; k < fields.size(); ++k) {
const std::string &word = fields[k];
int word_id = gazetteer_word_alphabet_.Lookup(word);
CHECK_GE(word_id, 0);
CHECK_LT(word_id, gazetteer_word_entity_tags_.size());
int entity_type_id = -1;
if (fields.size() == 2) {
gazetteer_word_entity_tags_[word_id].
push_back(entity_type_unique_id);
entity_type_id = entity_type_unique_id;
} else if (k == 1) {
gazetteer_word_entity_tags_[word_id].
push_back(entity_type_begin_id);
entity_type_id = entity_type_begin_id;
} else if (k == fields.size() - 1) {
gazetteer_word_entity_tags_[word_id].
push_back(entity_type_last_id);
entity_type_id = entity_type_last_id;
} else {
entity_type_id = entity_type_inside_id;
}
int l = -1;
for (l = 0; l < gazetteer_word_entity_tags_[word_id].size();
++l) {
if (gazetteer_word_entity_tags_[word_id][l] == entity_type_id) {
break;
}
}
if (l == gazetteer_word_entity_tags_[word_id].size()) {
gazetteer_word_entity_tags_[word_id].
push_back(entity_type_inside_id);
push_back(entity_type_id);
}
}
}
}
is.close();
}

gazetteer_word_alphabet_.StopGrowth();
Expand All @@ -185,4 +226,5 @@ void EntityDictionary::ReadGazetteerFiles() {
<< gazetteer_word_alphabet_.size();
LOG(INFO) << "Number of gazetteer entity tags: "
<< gazetteer_entity_tag_alphabet_.size();

}
9 changes: 8 additions & 1 deletion src/entity_recognizer/EntityDictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,13 @@ class EntityDictionary : public SequenceDictionary {
}
}

gazetteer_word_alphabet_.StopGrowth();
gazetteer_entity_tag_alphabet_.StopGrowth();
LOG(INFO) << "Number of gazetteer words: "
<< gazetteer_word_alphabet_.size();
LOG(INFO) << "Number of gazetteer entity tags: "
<< gazetteer_entity_tag_alphabet_.size();

success = ReadInteger(fs, &length);
CHECK(success);
allowed_bigrams_.resize(length);
Expand All @@ -119,7 +126,7 @@ class EntityDictionary : public SequenceDictionary {
int id = gazetteer_word_alphabet_.Lookup(word);
if (id >= 0) {
gazetteer_ids->assign(gazetteer_word_entity_tags_[id].begin(),
gazetteer_word_entity_tags_[id].end());
gazetteer_word_entity_tags_[id].end());
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/entity_recognizer/EntityInstanceNumeric.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,7 @@ void EntityInstanceNumeric::Initialize(const EntityDictionary &dictionary,

dictionary.GetWordGazetteerIds(instance->GetForm(i),
&gazetteer_ids_[i]);
//LOG(INFO) << instance->GetForm(i) << ": " << gazetteer_ids_[i].size();

}
}

0 comments on commit 4f77016

Please sign in to comment.