-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtripleloss_labels_coco_to_hdf5_data.py
203 lines (183 loc) · 7.54 KB
/
tripleloss_labels_coco_to_hdf5_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#!/usr/bin/env python
# Convert pooled image fc7 features and their labels into HDF5 batches.
import numpy as np
import os
import random
random.seed(3)
import sys
from hdf5_npsequence_generator import SequenceGenerator, HDF5SequenceWriter
# UNK_IDENTIFIER is the word used to identify unknown words; it is always
# placed at index 0 of the vocabulary built by init_vocab_from_file.
UNK_IDENTIFIER = 'unk'
# Start every sentence in a new array, pad if shorter than the maximum.
# MAX_FRAMES is the default `max_frames` of fc7SequenceGenerator and is also
# embedded in the output directory name.
MAX_FRAMES = 1
# NOTE(review): MAX_WORDS is not referenced anywhere in this file as shown;
# confirm whether another module relies on it before removing.
MAX_WORDS = 20
"""Filenames has file with image/frame paths for vidids
and sentences with video ids"""
class fc7SequenceGenerator(SequenceGenerator):
    """Produce one (fc7 feature vector, multi-hot label vector) pair per image.

    Each entry of `filenames` names three text files: a file of image ids,
    a file of pooled image/frame features keyed by id, and a file of labels
    keyed by id. Everything is read at construction time; get_streams()
    then walks the collected (image id, labels) pairs one at a time.
    """

    def __init__(self, filenames, vocab_filename, batch_num_streams=1,
                 max_frames=MAX_FRAMES, align=True, shuffle=True, pad=True,
                 truncate=True):
        """Load image ids, pooled features, labels and the vocabulary.

        Args:
            filenames: iterable of (image-id file, pooled-feature file,
                label file) path triples. The id and label files are
                tab-separated with the image id in the second column; the
                feature file is comma-separated with the id first.
            vocab_filename: path of the vocabulary file.
            batch_num_streams: stored on the instance unchanged.
            max_frames: stored on the instance unchanged.
            align: accepted for interface compatibility; not used here.
            shuffle: when true, shuffle the (image id, labels) pairs.
            pad: stored on the instance unchanged.
            truncate: stored on the instance unchanged.
        """
        self.max_frames = max_frames
        self.vid_labels = {}
        self.lines = []
        self.vid_poolfeats = {}  # image id -> list of feature strings
        self.image_ids = {}
        num_empty_lines = 0

        for imgid_file, poolfeatfile, sentfile in filenames:
            print('Reading features from file: %s' % poolfeatfile)

            # Collect the id column of EVERY line. The previous code passed a
            # single id string to dict.fromkeys, which keyed the dict by the
            # individual characters of that string, so no real id matched.
            wanted_ids = {}
            with open(imgid_file, 'r') as infd:
                for id_line in infd:
                    # Same normalisation as the label file below, so the
                    # keys compare equal (no trailing newline).
                    fields = id_line.strip().split('\t')
                    if len(fields) < 2:
                        continue  # blank or malformed line: no id column
                    wanted_ids[fields[1]] = 1
            self.image_ids = wanted_ids

            # Each line is "<image id>,<comma-separated feature values>".
            with open(poolfeatfile, 'r') as poolfd:
                for feat_line in poolfd:
                    img_id, _, feat_text = feat_line.strip().partition(',')
                    if img_id in self.image_ids:
                        self.vid_poolfeats.setdefault(img_id, []).append(
                            feat_text)

            # Each line is "<comma-separated labels>\t<image id>".
            if os.path.exists(sentfile):
                print('Reading labels in: %s' % sentfile)
                with open(sentfile, 'r') as sentfd:
                    for sent_line in sentfd:
                        id_sent = sent_line.strip().split('\t')
                        if len(id_sent) < 2:
                            num_empty_lines += 1
                            continue
                        labels, img_id = id_sent[0], id_sent[1]
                        if img_id in self.image_ids:
                            self.vid_labels[img_id] = labels
                            self.lines.append((img_id, labels))

        print('lines: %d ' % len(self.lines))
        if num_empty_lines > 0:
            print('Warning: ignoring %d empty lines.' % num_empty_lines)

        # Iteration state and counters.
        self.line_index = 0
        self.num_resets = 0
        self.num_truncates = 0
        self.num_pads = 0
        self.num_outs = 0
        self.frame_list = []
        self.vocabulary = {}
        self.vocabulary_inverted = []
        # initialize vocabulary
        self.init_vocabulary(vocab_filename)
        SequenceGenerator.__init__(self)
        self.batch_num_streams = batch_num_streams  # needed in hdf5 to seq
        if shuffle:
            random.shuffle(self.lines)
        self.pad = pad
        self.truncate = truncate

    def streams_exhausted(self):
        """Return True once the line list has been fully traversed."""
        return self.num_resets > 0

    def init_vocabulary(self, vocab_filename):
        """Load the vocabulary from `vocab_filename` if that file exists.

        When the file is missing an error is printed and the vocabulary is
        left untouched (empty when called from __init__).
        """
        print("Initializing the vocabulary.")
        if os.path.isfile(vocab_filename):
            # Text mode: words are compared against str constants below.
            with open(vocab_filename, 'r') as vocab_file:
                self.init_vocab_from_file(vocab_file)
        else:
            print("Error: No vocab file!")

    def init_vocab_from_file(self, vocab_filedes):
        """Build the word <-> index tables from an open vocabulary file.

        The first whitespace-separated token of each line is taken as a
        word. Index 0 is reserved for UNK_IDENTIFIER; the remaining words
        are numbered in file order.

        Raises:
            AssertionError: if a word occurs more than once.
        """
        # initialize the vocabulary with the UNK word
        self.vocabulary = {UNK_IDENTIFIER: 0}
        self.vocabulary_inverted = [UNK_IDENTIFIER]
        num_words_dataset = 0
        for line in vocab_filedes:
            split_line = line.split()
            if not split_line:
                # Blank line: there is no word to read (previously an
                # IndexError).
                continue
            word = split_line[0]
            if word == UNK_IDENTIFIER:
                continue
            assert word not in self.vocabulary
            num_words_dataset += 1
            self.vocabulary[word] = len(self.vocabulary_inverted)
            self.vocabulary_inverted.append(word)
        num_words_vocab = len(self.vocabulary)
        print(('Initialized vocabulary from file with %d unique words ' +
               '(from %d total words in dataset).') %
              (num_words_vocab, num_words_dataset))
        assert len(self.vocabulary_inverted) == num_words_vocab

    def next_line(self):
        """Advance to the next line, wrapping around at the end.

        Every wrap-around increments `num_resets`, which is what
        streams_exhausted() checks.
        """
        # float so that the percentage below uses true division.
        num_lines = float(len(self.lines))
        print("num_lines : %s" % num_lines)
        if (self.line_index == 1 or self.line_index == num_lines
                or self.line_index % 10000 == 0):
            print('Processed %d/%d (%f%%) lines' % (
                self.line_index, num_lines,
                100 * self.line_index / num_lines))
        self.line_index += 1
        if self.line_index == num_lines:
            self.line_index = 0
            self.num_resets += 1

    def get_pad_value(self, stream_name):
        """Return the padding value, 0 for every stream."""
        return 0

    def labels_to_values(self, label_list):
        """Convert a comma-separated label string into a multi-hot row.

        Args:
            label_list: line with "," separated list of positive labels.

        Returns:
            Array of 0/1 values with shape (1, len(vocabulary) + 1); the
            column at (vocabulary index + 1) is 1 for every positive label.

        Raises:
            KeyError: if a label is not in the vocabulary.
        """
        pos_labels = label_list.split(',')
        label_arr = np.zeros((1, len(self.vocabulary) + 1))
        label_indices = [self.vocabulary[label] + 1 for label in pos_labels]
        label_arr[0, label_indices] = 1
        return label_arr

    def float_line_to_stream(self, line):
        """Parse a comma-separated string of numbers into a list of floats."""
        # A real list (not a lazy map object) because get_streams takes its
        # len().
        return [float(value) for value in line.split(',')]

    # we have pooled fc7 features already in the file
    def get_streams(self):
        """Return the streams for the current line and advance to the next.

        Returns:
            dict with 'img_only_fc7', the first stored feature vector of
            the image reshaped to (1, number of features), and
            'img_only_labels', the multi-hot label row from
            labels_to_values().

        Raises:
            AssertionError: if the image has no pooled features loaded.
        """
        vidid, line = self.lines[self.line_index]
        assert vidid in self.vid_poolfeats
        text_mean_fc7 = self.vid_poolfeats[vidid][0]  # list inside dict
        mean_fc7 = self.float_line_to_stream(text_mean_fc7)
        # The raw label string stored for this id is replaced by its array.
        self.vid_labels[vidid] = self.labels_to_values(line)
        labels = self.vid_labels[vidid]
        self.num_outs += 1
        out = {}
        out['img_only_fc7'] = np.array(mean_fc7).reshape(1, len(mean_fc7))
        out['img_only_labels'] = labels
        self.next_line()
        return out
# Passed to fc7SequenceGenerator as batch_num_streams and embedded in the
# output directory name below.
VIDEO_STREAMS = 1
# NOTE(review): BUFFER_SIZE is not referenced elsewhere in this file as shown.
BUFFER_SIZE = 32 # TEXT streams
# Assigned to the generator's batch_stream_length in preprocess_dataset.
# NOTE(review): the inline "(21 * 50)" evaluates to 1050, not 1000; the note
# looks stale.
BATCH_STREAM_LENGTH = 1000 # (21 * 50)
# Dataset root, given as a relative path.
SETTING = 'data/coco2014'
# Output directory:
#   <SETTING>/hdf5/buffer_<VIDEO_STREAMS>_only8newobj_label72k_<MAX_FRAMES>
# Earlier directory naming, kept for reference:
# OUTPUT_BASIS_DIR = '{0}/hdf5/buffer_{1}_only8obj_label72k_{2}'.format(SETTING,
OUTPUT_BASIS_DIR = '{0}/hdf5/buffer_{1}_only8newobj_label72k_{2}'.format(SETTING,
VIDEO_STREAMS, MAX_FRAMES)
# Vocabulary file; the first whitespace-separated token of each line is read
# as a word.
VOCAB = './surf_intersect_glove.txt'
# '%%s' leaves a literal '%s' placeholder that preprocess_dataset fills with
# the split name.
OUTPUT_BASIS_DIR_PATTERN = '%s/%%s_batches' % OUTPUT_BASIS_DIR
# Pooled fc7 feature file; {0} is replaced with the data split name.
POOLFEAT_FILE_PATTERN = 'data/coco2014/Imagenet_{0}_vgg_fc7.txt'
# The label file and the image-id file currently point at the same file,
# which is parsed as tab-separated columns. Despite the *_PATTERN names,
# neither contains a placeholder.
LABEL_FILE_PATTERN = 'data/coco2014/test_imagenet_images.txt'
IMAGEID_FILE_PATTERN='data/coco2014/test_imagenet_images.txt'
# Alternative image-id list, kept for reference:
# IMAGEID_FILE_PATTERN='data/coco2014/coco_rm8objs_image_list_train2014.txt'
def preprocess_dataset(split_name, data_split_name, batch_stream_length, aligned=False):
    """Write the HDF5 batches and file lists for one dataset split.

    Args:
        split_name: fills the placeholder of OUTPUT_BASIS_DIR_PATTERN to
            form the output directory.
        data_split_name: fills the placeholder of POOLFEAT_FILE_PATTERN to
            form the feature file path.
        batch_stream_length: assigned to the generator's
            `batch_stream_length` attribute.
        aligned: accepted but overridden; see the note in the body.
    """
    # The caller's value is replaced: align, pad and truncate are always
    # passed to the generator as True.
    aligned = True

    feature_path = POOLFEAT_FILE_PATTERN.format(data_split_name)
    input_triples = [(IMAGEID_FILE_PATTERN, feature_path, LABEL_FILE_PATTERN)]
    output_dir = OUTPUT_BASIS_DIR_PATTERN % split_name

    generator = fc7SequenceGenerator(
        input_triples,
        VOCAB,
        VIDEO_STREAMS,
        max_frames=MAX_FRAMES,
        align=aligned,
        shuffle=True,
        pad=aligned,
        truncate=aligned)
    generator.batch_stream_length = batch_stream_length

    hdf5_writer = HDF5SequenceWriter(generator, output_dir=output_dir)
    hdf5_writer.write_to_exhaustion()
    hdf5_writer.write_filelists()

    # Reported only after writing has finished.
    vocab_present = os.path.isfile(VOCAB)
    if not vocab_present:
        print("Vocabulary not found")
def process_splits():
    """Run preprocess_dataset for every enabled split."""
    # Each entry is (split_name, data_split_name, aligned); commented-out
    # entries are disabled splits.
    enabled_splits = (
        # ('valid', 'mytest', True),
        ('train', 'train', False),
        # ('test', 'test', False),
    )
    for entry in enabled_splits:
        split_name, data_split_name, aligned = entry
        preprocess_dataset(split_name, data_split_name,
                           BATCH_STREAM_LENGTH, aligned)
# Script entry point: convert all enabled splits when run directly.
if '__main__' == __name__:
    process_splits()