#
# Open Addresses Common ETL Library
# Open Addresses Collect Raw Data
#
#
# Version 1.0 (Python) in progress
# Author John Murray
# Licence MIT
#
# Purpose Script to collect bulk Open Data feeds
#
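# Note: this script targets Python 2 (ConfigParser, HTMLParser, urllib2 and
# print statements); under Python 3 the equivalent modules are configparser,
# html.parser and urllib.request.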
import ConfigParser
from HTMLParser import HTMLParser
import urllib
import urllib2
import urlparse
import zipfile
import MySQLdb
import fnmatch
import time
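
# LinkExtractor scans an HTML page for <a href> links whose file name matches
# the configured wildcard mask and file extension, resolving relative links
# against the page URL.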
class LinkExtractor(HTMLParser):
    def reset(self):
        HTMLParser.reset(self)
        self.links = []

    def setPattern(self, url, mask, filetype):
        self.mask = mask
        self.type = filetype
        self.base = url  # kept whole so urljoin can resolve relative hrefs

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            attrs = dict(attrs)  # attribute list -> dict for lookup
            if "href" in attrs:
                href = attrs["href"]
                if "/" in href:
                    fname = href[href.rfind("/") + 1:].lower()
                else:
                    fname = href.lower()
                if fname.endswith("." + self.type) and fnmatch.fnmatch(fname, self.mask):
                    # urljoin handles relative, root-relative and absolute hrefs
                    self.links.append(urlparse.urljoin(self.base, href))
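
# collectData fetches the index page at `url`, extracts links matching
# `filemask`/`filetype`, and downloads any file that is new or changed
# compared with the `Files` table. `fileget` is 'all' (every new file) or
# 'latest' (only files carrying the most recent Last-Modified time).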
def collectData(db, cur, url, filemask, filetype, fileget):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    response = opener.open(url)
    html = response.read()
    parser = LinkExtractor()
    parser.setPattern(url, filemask, filetype)
    parser.feed(html)
    links = parser.links
    files = []
    latest = time.gmtime(0)  # epoch, so any real Last-Modified compares newer
    for l in links:
        response = urllib2.urlopen(l)
        meta = response.info()
        modtime = time.strptime(meta['Last-Modified'], "%a, %d %b %Y %H:%M:%S %Z")
        files.append([l, meta['Content-Length'], modtime])
        if modtime > latest:
            latest = modtime
    for f in files:
        fname = f[0][f[0].rfind("/") + 1:]
        mtime = time.strftime('%Y-%m-%d %H:%M:%S', f[2])
        # Parameterised queries avoid SQL injection from URLs and headers
        cur.execute("SELECT * FROM `Files` WHERE `fileurl`=%s AND `size`=%s AND `modtime`=%s",
                    (f[0], f[1], mtime))
        if cur.rowcount == 0:
            if fileget == 'all' or (fileget == 'latest' and f[2] >= latest):
                print "Downloading: " + f[0]
                urllib.urlretrieve(f[0], fname)
                cur.execute("INSERT INTO `Files`(`fileurl`, `size`, `modtime`) VALUES (%s,%s,%s)",
                            (f[0], f[1], mtime))
                db.commit()
                if filetype.lower() == 'zip':
                    print "Unzipping: " + fname
                    with zipfile.ZipFile(fname, "r") as z:
                        z.extractall()
            else:
                print "Not latest: " + f[0]
        else:
            print "Unchanged: " + f[0]
config = ConfigParser.ConfigParser()
config.read("oa_alpha_etl.cnf")
username = config.get('database', 'username')
password = config.get('database', 'password')
hostname = config.get('database', 'hostname')
database = config.get('database', 'database')
dbConn = MySQLdb.connect(host=hostname,user=username,passwd=password,db=database)
cur = dbConn.cursor()
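
# collectData (above) tracks downloads in a `Files` table; a minimal sketch
# of a compatible schema (column types are assumptions, not from the source):
#
#   CREATE TABLE `Files` (
#       `fileurl` VARCHAR(255),
#       `size`    VARCHAR(20),
#       `modtime` DATETIME
#   );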
sources = config.get('sources', 'sources').split(",")
for s in sources:
    url = config.get(s, 'url')
    filemask = config.get(s, 'filemask')
    filetype = config.get(s, 'filetype')
    if config.has_option(s, 'get'):
        fileget = config.get(s, 'get')
    else:
        fileget = "all"  # default: fetch every new or changed file
    collectData(dbConn, cur, url, filemask, filetype, fileget)
dbConn.close()