# entities_extraction.py
import os
import nltk
import pickle
from typing import List
from nltk.stem import WordNetLemmatizer
from load_annotations import load_captions
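
# One-time setup: the tokenizer, POS tagger, and lemmatizer used below rely
# on NLTK corpora that are not bundled with the library itself. A minimal
# sketch of a first-run download guard, assuming the standard NLTK resource
# identifiers (these names can vary across NLTK versions):
for resource in ['punkt', 'averaged_perceptron_tagger', 'wordnet']:
    nltk.download(resource, quiet=True)
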
def main(captions: List[str], path: str) -> None:
    # Write a list file of the form [[[entity1, entity2, ...], caption], ...]
    lemmatizer = WordNetLemmatizer()
    new_captions = []
    for caption in captions:
        detected_entities = []
        pos_tags = nltk.pos_tag(nltk.word_tokenize(caption))  # e.g., [('woman', 'NN'), ...]
        for entities_with_pos in pos_tags:
            # Keep singular (NN) and plural (NNS) nouns as candidate entities
            if entities_with_pos[1] == 'NN' or entities_with_pos[1] == 'NNS':
                entity = lemmatizer.lemmatize(entities_with_pos[0].lower().strip())
                detected_entities.append(entity)
        detected_entities = list(set(detected_entities))  # deduplicate
        new_captions.append([detected_entities, caption])

    with open(path, 'wb') as outfile:
        pickle.dump(new_captions, outfile)
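
# Illustrative example (tagger output can vary, and set() leaves entity order
# unspecified): for the caption 'Two dogs play in the grass', the NN/NNS
# filter keeps 'dogs' and 'grass', and lemmatization maps 'dogs' -> 'dog',
# so the stored entry would be [['dog', 'grass'], 'Two dogs play in the grass'].
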
if __name__ == '__main__':
    datasets = ['coco_captions', 'flickr30k_captions']
    captions_path = [
        './annotations/coco/train_captions.json',
        './annotations/flickr30k/train_captions.json'
    ]
    out_path = [
        './annotations/coco/coco_with_entities.pickle',
        './annotations/flickr30k/flickr30k_with_entities.pickle'
    ]
    idx = 0  # the only switch to change: 0 -> COCO training data, 1 -> Flickr30k training data

    if os.path.exists(out_path[idx]):
        # The pickle already exists: load it and preview the first 20 entries.
        print('Reading...')
        with open(out_path[idx], 'rb') as infile:
            captions_with_entities = pickle.load(infile)
        print(f'The length of the dataset: {len(captions_with_entities)}')
        captions_with_entities = captions_with_entities[:20]
        for caption_with_entities in captions_with_entities:
            print(caption_with_entities)
    else:
        # Otherwise, extract entities from the raw captions and write the pickle.
        print('Writing...')
        captions = load_captions(datasets[idx], captions_path[idx])
        main(captions, out_path[idx])