-
Notifications
You must be signed in to change notification settings - Fork 11
/
precinct_url_scraper_2018_g.py
80 lines (73 loc) · 2.98 KB
/
precinct_url_scraper_2018_g.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import re
import requests
from bs4 import BeautifulSoup
from time import sleep
# Race categories requested from the statewide results page; each value is
# substituted into the ``type`` query parameter of the request URL.
election_types = ['SW', 'CG', 'LD']

# Root of the results site; page paths and scraped hrefs are appended to it.
url_stub = 'https://electionresults.sos.ne.gov/'

# Output file name for a downloaded export; ``{office}`` is filled with one
# of the slugs from ``office_slugs``.
file_name_pattern = '20181106__ne__general__{office}.xls'

# Race heading text -> slug used in the output file name.  The legislature
# slug carries a ``{district}`` placeholder that is filled in per race.
office_slugs = dict((
    ('For Governor and Lt. Governor -', 'governor'),
    ('For United States Senator - 6 Year Term -', 'senate'),
    ('For Representative in Congress - 2 Year Term - District 01', 'house__1'),
    ('For Representative in Congress - 2 Year Term - District 02', 'house__2'),
    ('For Representative in Congress - 2 Year Term - District 03', 'house__3'),
    ('For Secretary of State -', 'secretary_of_state'),
    ('For Auditor of Public Accounts -', 'state_auditor'),
    ('For State Treasurer -', 'state_treasurer'),
    ('For Attorney General -', 'attorney_general'),
    ('For Member of the Legislature', 'state_house__{district}'),
))
# For every election type, fetch the statewide results page, walk its race
# rows, and download the precinct-level export of each race that has a slug
# defined in ``office_slugs``.  Races without a slug are reported and skipped.
for election_type in election_types:
    # request the statewide results page
    sw_url_pattern = (
        url_stub +
        'resultsSW.aspx?text=Race&type={election_type}&map=CTY&lValue=100&gValue=001'
    )
    sw_page_content = requests.get(sw_url_pattern.format(
        election_type=election_type
    )).content
    # parse out the table rows
    sw_page_soup = BeautifulSoup(sw_page_content, 'lxml')
    trs = sw_page_soup.find(
        'table',
        id='ContentPlaceHolder1xuwgResults_main'
    ).find_all('tr')
    # iterate over table rows
    for tr in trs:
        heading = tr.find('h1')
        if heading is None:
            # rows without an <h1> do not describe a race
            continue
        # parse out the office and district from the race text
        race_text = heading.text.strip()
        race = race_text.split(' - District')
        office = race[0].strip()
        print(office)
        district = race[1].strip() if len(race) > 1 else None
        # find the office slug (if it's defined).  Try the full heading
        # first: some keys in office_slugs include the " - District NN"
        # suffix and can never match the district-stripped office name.
        slug_pattern = office_slugs.get(race_text)
        if slug_pattern is None:
            slug_pattern = office_slugs.get(office)
        if slug_pattern is None:
            # Skip explicitly so a slug left over from an earlier row is
            # never reused (which would overwrite that race's file).
            print(' Unknown office: {0}'.format(office))
            continue
        office_slug = slug_pattern.format(district=district)
        # continue only if one of the defined offices
        print(' Getting precinct results for {0}...'.format(office))
        # parse out the county results page url
        cty_page_url = url_stub + tr.find(
            'div',
            class_='grid-countylevelresults'
        ).find('a')['href']
        # request the county results page
        cty_page_content = requests.get(cty_page_url).content
        cty_page_soup = BeautifulSoup(cty_page_content, 'lxml')
        # parse out the "Export Precinct Level" button url
        export_url = url_stub + cty_page_soup.find(
            'a',
            id='ContentPlaceHolder1_hlnkExportPrec'
        )['href']
        # the href is relative ("../"); collapse it so it resolves from the
        # site root
        export_url = export_url.replace('/../', '/')
        with open(
            file_name_pattern.format(office=office_slug),
            'wb'
        ) as f:
            f.write(requests.get(export_url).content)
        # pause between downloads
        sleep(2)
    # pause between statewide result pages
    sleep(2)