-
Notifications
You must be signed in to change notification settings - Fork 9
/
KarpathySplit.py
54 lines (37 loc) · 1.35 KB
/
KarpathySplit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# coding: utf-8
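# Karpathy split for the MS-COCO dataset: merges the 2014 train/val caption
# files and re-splits the images into val, test, and train subsets.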
import json
from random import shuffle, seed
seed(123)  # Fixed seed so the shuffle below produces the same split each run

# Number of images held out for the val and test subsets; all remaining
# images go to train.
num_val = 5000
num_test = 5000

# Load both caption files. The context managers close each handle as soon as
# parsing is done instead of leaving it to the garbage collector.
with open('./data/annotations/captions_val2014.json', 'r') as fp:
    val = json.load(fp)
with open('./data/annotations/captions_train2014.json', 'r') as fp:
    train = json.load(fp)

# Merge together (val first, then train -- the order feeds the seeded shuffle,
# so changing it would change the resulting split).
imgs = val['images'] + train['images']
annots = val['annotations'] + train['annotations']

shuffle(imgs)

# Split into val, test, train by slicing the shuffled image list.
dataset = {
    'val': imgs[:num_val],
    'test': imgs[num_val:num_val + num_test],
    'train': imgs[num_val + num_test:],
}

# Group annotations by image id: image_id -> list of annotation dicts.
itoa = {}
for a in annots:
    itoa.setdefault(a['image_id'], []).append(a)

json_data = {}
# The 'info' and 'licenses' sections are taken from the train file and reused
# unchanged in every output file.
info = train['info']
licenses = train['licenses']

split = ['test', 'val', 'train']
for subset in split:
    json_data[subset] = {
        'type': 'caption',
        'info': info,
        'licenses': licenses,
        'images': [],
        'annotations': [],
    }

    for img in dataset[subset]:
        # Strict lookup on purpose: an image id with no annotations raises
        # KeyError rather than being written out silently without captions.
        anns = itoa[img['id']]
        json_data[subset]['images'].append(img)
        json_data[subset]['annotations'].extend(anns)

    # One output file per subset, written after all of its images have been
    # collected. The with-block guarantees the file is flushed and closed.
    with open('data/annotations/karpathy_split_' + subset + '.json', 'w') as fp:
        json.dump(json_data[subset], fp)