-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path: DownloadRedditDataset.py
98 lines (91 loc) · 3.23 KB
/
DownloadRedditDataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import json
import os
import sys
import traceback
import urllib2
import urlparse
def addTitlesToJson(dataDir):
    """Ensure every battle-graph JSON under dataDir carries a 'title' key.

    Walks dataDir recursively; for each ``*.json`` file (skipping
    ``graph*.json`` files) whose JSON object lacks a ``'title'`` key, sets
    the title to the file's directory path relative to dataDir, then
    rewrites the file in place and mirrors it into the sibling
    ``../jsons`` directory.

    :param dataDir: root directory of the per-battle dataset folders
    """
    # Normalize with a trailing slash so root[len(dataDir):] below yields
    # a clean relative path.
    if not dataDir.endswith('/'):
        dataDir += '/'
    jsonDir = os.path.join(dataDir, '..', 'jsons')
    # Bug fix: the mirror directory was never created, so the second
    # open(..., 'w') below failed on a fresh checkout.  Ignore the error
    # if it already exists (Python 2 makedirs has no exist_ok).
    try:
        os.makedirs(jsonDir)
    except OSError:
        pass
    for root, dirs, files in os.walk(dataDir):
        dirname = root[len(dataDir):]  # battle folder path relative to dataDir
        for f in files:
            if f.endswith('.json') and not f.startswith('graph'):
                with open(os.path.join(root, f), 'r') as fp:
                    graph = json.load(fp)
                if 'title' not in graph:
                    graph['title'] = dirname
                    # Rewrite in place and mirror into ../jsons.
                    with open(os.path.join(root, f), 'w') as fp:
                        json.dump(graph, fp)
                    with open(os.path.join(jsonDir, f), 'w') as fp:
                        json.dump(graph, fp)
def downloadImages(image_urls, image_names, out_dir):
    """Download every URL in image_urls into out_dir.

    ``image_urls`` and ``image_names`` are parallel lists: ``image_urls[j]``
    is fetched and written as ``out_dir/image_names[j]``.  Files that
    already exist locally are skipped, so the function is safe to re-run.
    Responses of 600 bytes or fewer are treated as placeholder/error pages
    and discarded.

    :param image_urls:  list of image URL strings
    :param image_names: list of destination file names, parallel to image_urls
    :param out_dir:     destination directory (created if missing)
    :return: number of files actually written
    """
    print("Scanning URLs...")
    # Create the output directory; ignore the error if it already exists
    # (Python 2 os.makedirs has no exist_ok parameter).
    try:
        os.makedirs(out_dir)
    except OSError:
        pass
    written = 0
    for url, img_name in zip(image_urls, image_names):
        out_path = os.path.join(out_dir, img_name)
        if os.path.exists(out_path):
            continue  # already downloaded on a previous run
        try:
            print("Reading URL:", url)
            f = urllib2.urlopen(url, timeout=10)
            data = f.read()
            f.close()
            print(" Datasize:", len(data))
            # Tiny responses are almost always "image removed" placeholders
            # or HTML error pages rather than real images -- skip them.
            if len(data) > 600:
                out = open(out_path, 'wb')
                out.write(data)
                out.close()
                written += 1
        except KeyboardInterrupt:
            raise
        except Exception:
            # Bug fix: the old handler did `imageNameList[each[1]] = ...`
            # where `each` was an undefined name, so any fetch error raised
            # a NameError inside the handler.  Log the failure instead.
            print("Error Fetching URL:", url)
            traceback.print_exc()
    return written
def downloadFromJson(gfile, outDir):
    """Download all node images referenced by one graph JSON file.

    Reads the graph at gfile, fetches every node's 'URL' into
    outDir/<graph title>/, rewrites each node's 'file' entry to point at
    the downloaded location, and saves the updated graph JSON alongside
    the images.
    """
    with open(gfile, 'r') as fp:
        jsonGraph = json.load(fp)
    destDir = os.path.join(outDir, jsonGraph['title'])
    urls, names = [], []
    for node in jsonGraph['nodes']:
        base = os.path.basename(node['file'])
        urls.append(node['URL'])
        names.append(base)
        # Repoint the node at where the image will live locally.
        node['file'] = os.path.join(destDir, base)
    downloadImages(urls, names, destDir)
    # Persist the updated graph next to its images.
    with open(os.path.join(destDir, os.path.basename(gfile)), 'w') as fp:
        json.dump(jsonGraph, fp)
def downloadImagesforAllJsons(jsonDir, outDir):
    """Run downloadFromJson on every *.json file directly inside jsonDir."""
    for entry in os.listdir(jsonDir):
        path = os.path.join(jsonDir, entry)
        # Only regular files with a .json extension are graph descriptors.
        if entry.endswith('.json') and os.path.isfile(path):
            downloadFromJson(path, outDir)
def usage():
    """Print the command-line help text for this script."""
    help_lines = (
        'Download a given reddit photoshop battle dataset from reference json files',
        'python2 DownloadRedditDataset.py <json folder> -outputDir <output folder>',
    )
    for line in help_lines:
        print(line)
# ---- command-line entry point -------------------------------------------
# Usage: python2 DownloadRedditDataset.py <json folder> [-outputDir <dir>]
args = sys.argv[1:]
outputDir = '.'
jsonDir = None
while args:
    a = args.pop(0)
    if a == '-h':
        # Bug fix: '-h' used to print the help text and then fall through
        # into the download anyway; exit immediately instead.
        usage()
        sys.exit(0)
    elif a == '-outputDir':
        outputDir = args.pop(0)
    elif not jsonDir:
        # First non-flag argument is the json folder.
        jsonDir = a
if jsonDir is None:
    # Bug fix: a missing positional argument used to crash later inside
    # os.listdir(None); report the problem and exit with an error code.
    usage()
    sys.exit(1)
downloadImagesforAllJsons(jsonDir, outputDir)