extract_data.py
# coding: utf8
import boto
from boto.s3.key import Key
from gzipstream import GzipStreamFile
import json
import warc
import csv
import requests
import zlib
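
# Pipeline: download the list of WAT paths for one Common Crawl snapshot,
# then stream each WAT file from S3 and pull (title, description, uri)
# triples out of its JSON metadata records into output.csv.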
def get_partial_warc_file(url):
    """
    Stream one WAT file straight from S3 and append the title, meta
    description and target URI of every page record to output.csv.

    We use the incredible gzipstream module because of limitations in
    the warc Python module, which cannot read a gzipped stream on its
    own. Seriously, if that module did not exist, this task would have
    been impossible. Thanks a lot to the creator:
    https://github.com/commoncrawl/gzipstream
    """
    conn = boto.connect_s3(anon=True)
    pds = conn.get_bucket('aws-publicdatasets')
    # Start a connection to one of the WARC files
    k = Key(pds)
    k.key = url
    wf = warc.WARCFile(fileobj=GzipStreamFile(k))
    for num, record in enumerate(wf):
        try:
            print 'On Record {0}'.format(num)
            payload = record.payload.read()
            # Only metadata records carry a JSON payload
            if payload[0] == "{":
                r = json.loads(payload)
                description = ''
                head = r['Envelope']['Payload-Metadata']['HTTP-Response-Metadata']['HTML-Metadata']['Head']
                title = head['Title'].encode('utf-8')
                for x in head.get('Metas', []):
                    if x.get('name') == 'description':
                        description = x['content'].encode('utf-8')
                uri = r['Envelope']['WARC-Header-Metadata']['WARC-Target-URI']
                with open('output.csv', 'ab') as f:
                    writer = csv.writer(f)
                    writer.writerow([title, description, uri])
        except Exception:
            # Records without a Head/Title, or with malformed JSON, are skipped
            pass
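
# For reference, the keys read above sit in each WAT record's JSON envelope
# roughly like this (abridged to the fields this script uses; values are
# illustrative):
#
#   {"Envelope": {
#       "WARC-Header-Metadata": {"WARC-Target-URI": "http://example.com/"},
#       "Payload-Metadata": {"HTTP-Response-Metadata": {"HTML-Metadata": {
#           "Head": {"Title": "Example",
#                    "Metas": [{"name": "description", "content": "..."}]}}}}}}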
def get_urls():
    """Fetch the list of WAT file paths for the CC-MAIN-2016-07 crawl."""
    url = "https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2016-07/wat.paths.gz"
    try:
        response = requests.get(url)
    except Exception as e:
        print(e)
        return []
    # 32 + MAX_WBITS tells zlib to transparently accept the gzip header
    data = zlib.decompress(response.content, zlib.MAX_WBITS | 32)
    return [line for line in data.split("\n") if line]
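
# Each returned line is an S3 key into the aws-publicdatasets bucket; the
# keys look roughly like this (illustrative, segment and file names vary):
#   common-crawl/crawl-data/CC-MAIN-2016-07/segments/<segment>/wat/<file>.warc.wat.gz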
print("Enter Last Known URL from urls.csv or Leave Blank if want to start from 0")
inurl = str(raw_input())
urls = get_urls()
if inurl == '':
idx = 0
else:
idx = urls.index(inurl)
for url in urls[idx:]:
get_partial_warc_file(url=url)
with open('urls.csv', 'ab') as f:
writer = csv.writer(f)
writer.writerow([url])
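
# Usage (Python 2; needs boto, warc, gzipstream and requests installed):
#   $ python extract_data.py
# Press Enter at the prompt for a fresh run, or paste the last line of
# urls.csv to resume. Extracted (title, description, uri) rows accumulate
# in output.csv, and each fully processed WAT path is logged to urls.csv.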