dataportal.py (forked from dssg/data-portal-treemap)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import time
from pattern.web import URL, DOM, plaintext, strip_between
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT
from pattern.web import abs
from bs4 import BeautifulSoup
import urllib2
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0
## Paul Meinshausen
## Chicago Data Portal scraper
## Open (create) the csv file that the data will be written to.
output = open("chicagometadata3.csv", "wb")
writer = csv.writer(output)
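## Optional (not in the original script): a header row matching the columns
## written at the bottom of the scraping loop could be emitted here; the
## column names are only suggestions derived from the variable names below:
## writer.writerow(['name', 'category', 'tag', 'visits', 'description',
##                  'link', 'row_count'])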
## Start the Selenium webdriver that will load each dataset's page and wait
## until the data has loaded before scraping.
ff = webdriver.Firefox()
ff.implicitly_wait(30)
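## Illustrative alternative (not called below): the WebDriverWait and
## expected_conditions imports above support an explicit wait for the
## "row_count" element instead of the implicit wait plus fixed sleep used in
## the loop. A minimal sketch, assuming the element's text is populated once
## the element is located:
def wait_for_row_count(driver, timeout=30):
    ## Block until an element with class "row_count" is present, then return its text.
    element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, "row_count")))
    return element.text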
## The datasets in the portal are spread over several pages, so this loop
## steps the scraper through each page.
## Only the page number changes in the URL, so the URL() call below
## substitutes the loop counter p to select each page.
for p in range(1, 12):
    url = URL('https://data.cityofchicago.org/browse?limitTo=datasets&sortBy=oldest&utf8=%E2%9C%93&page=' + str(p))
    dom = DOM(url.download(cached=True))
    ## The list of datasets is in a table, so this loop cycles through the row
    ## elements to scrape each dataset.
    for i in dom.by_tag('tbody')[0:]:
        for g in i.by_tag('tr')[0:]:
            for h in g.by_tag('a.name')[0:]:
                name = h.content
                name = plaintext(name)
                name = name.encode('ascii', 'ignore')
            for j in g.by_class('category infoItem')[0:]:
                category = j.content
                category = plaintext(category)
                category = category.encode('ascii', 'ignore')
            if (g.by_class('tags infoItem')):
                tag = g.by_class('tags infoItem')[0].content
                tag = tag.encode('ascii', 'ignore')
            else:
                tag = " "
            for k in g.by_class('visits')[0:]:
                visits = k.content
                visits = visits[0:-6]
                visits = visits.encode('ascii', 'ignore')
            for l in g.by_class('description')[0:]:
                description = l.content
                description = description.encode('ascii', 'ignore')
            for link in g.by_tag('a')[1:2]:
                links = abs(link.attributes.get('href', ''), base=url.redirect or url.string)
                ff.get(links + "/about")
                element = ff.find_element_by_class_name("row_count")
                time.sleep(8)
                element_text = element.text
            ## Write each row to the file
            writer.writerow([name, category, tag, visits, description, links, element_text])

output.close()
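## Not in the original script: close the browser once scraping is finished.
ff.quit()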