-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathresult_sink.py
50 lines (48 loc) · 1.51 KB
/
result_sink.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import json
from base64 import b64decode
from hashlib import md5
import os
from os import system
import redis
import sys
import time
from shutil import move
'''super simple web crawl result consumer'''
# Pops JSON crawl results off the redis list 'resque:resultqueue', appends
# each one as a line to an hourly file under tmppath, and moves that file to
# outpath once the hour changes.  A screenshot carried in a result is
# base64-decoded and written as a PNG under <outpath>/sshots, then dropped
# from the record before it is written out.
#
# usage: <script> [outpath] [sample_sshot]
#   outpath       destination directory (default '/tmp/')
#   sample_sshot  the presence of any second argument turns on screenshot
#                 sampling: only results whose url md5 starts with '00'
#                 have their PNG written
outpath = sys.argv[1] if len(sys.argv) >= 2 else '/tmp/'
sample_sshot = len(sys.argv) >= 3
tmppath = '/mnt/tmp/'
# NOTE(review): this directory is not created here; it presumably has to
# exist before the first screenshot arrives -- confirm with deployment.
pngout_dir = outpath + '/sshots'
r = redis.StrictRedis(unix_socket_path='/var/run/redis.sock', db=0)
waits = 0    # number of brpop calls that came back empty
outputs = 0  # number of results popped off the queue

while True:
    # One pass of the outer loop handles one clock hour's output file.
    today = time.strftime('%Y%m%d%H')
    todayfn = '/doms.' + today + '.json'
    todayfile = tmppath + todayfn
    permfile = outpath + todayfn
    # Append mode: a restart within the same hour keeps adding to the file.
    with open(todayfile, 'a') as f:
        while today == time.strftime('%Y%m%d%H'):
            # brpop is given a timeout of 5 so the hour check above is
            # re-evaluated regularly even when the queue is idle.
            res = r.brpop('resque:resultqueue', 5)
            if res:
                outputs += 1
                r.incr('resque:outputs', 1)
                try:
                    val = json.loads(res[1])
                except (TypeError, ValueError):
                    # Undecodable payload: skip it.  Only parse errors are
                    # caught so Ctrl-C / SystemExit still stop the consumer.
                    continue

                # hashlib wants bytes; encode text urls so that non-ASCII
                # urls hash too (ASCII urls give the same digest as before).
                url = val['url']
                if not isinstance(url, bytes):
                    url = url.encode('utf-8')
                filekey = md5(url).hexdigest()

                if 'sshot' in val:
                    if (not sample_sshot) or filekey.startswith('00'):
                        # Default timestamp is the current time in
                        # milliseconds; presumably val['ts'] uses the same
                        # unit -- TODO confirm against the producer.
                        now = val.get('ts', time.time() * 1000)
                        pngname = '%s/%s-%s.png' % (pngout_dir, filekey, now)
                        # Binary mode: b64decode returns raw PNG bytes.
                        with open(pngname, 'wb') as pngfd:
                            pngfd.write(b64decode(val['sshot']))
                    # The screenshot never goes into the JSON output.
                    del val['sshot']

                # ck: "i have NO IDEA why this line is here."
                # The self-assignment changes nothing in val; its only
                # observable effect is a KeyError when 'dom' is missing.
                val['dom'] = val['dom']
                f.write(json.dumps(val) + '\n')
                f.flush()
            else:
                waits += 1
            # Single-line progress counter, redrawn in place via '\r'.
            sys.stdout.write('\rwaits: %d\tresults: %d' % (waits, outputs))
            sys.stdout.flush()
    # The hour changed and the file is closed: move it to its final place.
    move(todayfile, permfile)