# scrape_police_one.py
from STATES import STATES as state_dict
import requests
from bs4 import BeautifulSoup
import pandas as pd

# TODO: implement database writing (a hypothetical write_to_db sketch is at
# the bottom of this file).

# Template dictionary copied for each department while scraping; fields the
# department page does not list keep their "null" defaults.
police_dict = {"Country": "null", "Address 1": "null", "Address 2": "null",
               "City": "null", "State": "null", "Zip Code": "null",
               "County": "null", "Phone #": "null", "Fax #": "null",
               "Department Website": "null", "Type": "null",
               "Population Served": "null", "Number of Officers": "null"}


class Scraper:
    DOMAIN = 'http://www.policeone.com/{}'
    SEARCH = 'law-enforcement-directory/{}-Agencies/page-{}/'

    def __init__(self):
        self.agencies = {state.replace(' ', '-'): [] for state in state_dict.values()}
        self.search_next = True
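
    # Note: state names are normalized for URLs by replacing spaces with
    # hyphens, e.g. a hypothetical STATES entry 'New Mexico' becomes the key
    # 'New-Mexico' (STATES itself is not shown in this file).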

    def get_page_links(self, state, number):
        search_link = Scraper.DOMAIN.format(Scraper.SEARCH.format(state, number))
        req = requests.get(search_link)
        if req.status_code != 200:
            return None
        soup = BeautifulSoup(req.text, 'html.parser')
        # The directory repeats its last page: if the <title> still shows the
        # previous page number, pagination has run past the end of the listing.
        head = soup.find('head')
        title = head.find('title').text
        if 'Page' in title:
            num = title.split()[-1]
            # The title stores the page number as a string, so compare as ints.
            if num.isdigit() and int(num) == number - 1:
                self.search_next = False
                return 1
        links = []
        search_div = soup.find('div', {'id': 'search-results'})
        search_rows = search_div.find_all('tr')[2:-2]  # first 2 rows are headers, last 2 are footers
        for row in search_rows[:-1]:
            cols = row.find_all('td')
            link = cols[0].find('a').attrs['href']
            links.append(link)
        return links
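
    # Usage sketch (return values only; the state key is hypothetical):
    #   Scraper().get_page_links('California', 1)
    # returns a list of relative agency hrefs on success, None on a non-200
    # response, and the sentinel 1 once pagination repeats the last page.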

    def get_agency_details(self, given_link):
        link = Scraper.DOMAIN.format(given_link)
        req = requests.get(link)
        if req.status_code != 200:
            return None
        soup = BeautifulSoup(req.text, 'html.parser')
        dep_name_div = soup.find('h1', {'class': 'dep-head-text'})
        info = dep_name_div.text
        # Certain stations have a comma instead of a hyphen in their name;
        # str.split does not raise on a missing separator, so check explicitly.
        if '-' in info:
            name = info.split('-', 1)[0]
        else:
            name = info.split(',', 1)[0]
        # Copy the template so the "null" defaults are fresh for each department.
        return_info_dict = police_dict.copy()
        return_info_dict['Name'] = name
        dep_info_divs = soup.find_all('div', {'class': 'dep-block-info'})  # should return 2 divs
        for div in dep_info_divs:
            for tag in div.find_all('p'):
                elem = tag.text
                if ':' not in elem:  # skip paragraphs with no "Field: value" pair
                    continue
                field, data = elem.split(':', 1)
                return_info_dict[field.strip()] = data.strip()
        df = pd.DataFrame(return_info_dict, index=[1])  # one-row frame to append to the CSV
        print(return_info_dict)
        return df
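
    # For illustration: building a DataFrame from a dict of scalars requires
    # an explicit index, hence index=[1]; each call yields one row that run()
    # appends to policeone.csv.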

    def run(self, write_on=False):
        for state in self.agencies:
            i = 1
            self.search_next = True
            while self.search_next:
                links = self.get_page_links(state, i)
                if links is None:
                    self.agencies[state].append('ERROR: PAGE {}'.format(i))
                    break
                elif links == 1:
                    break
                for link in links:
                    agency_detail = self.get_agency_details(link)
                    if agency_detail is None:  # non-200 response for this agency page
                        print("Returned a NoneType object")
                        continue
                    if write_on:
                        # Appends one department per CSV row; to_csv opens the
                        # file itself, so a separate open() is not needed.
                        agency_detail.to_csv('policeone.csv', mode='a', header=False)
                i += 1
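

# A minimal sketch of the database writing flagged in the TODO at the top,
# assuming a local SQLite file via the standard-library sqlite3 module. The
# file name, table name, and schema are hypothetical placeholders; `record`
# is a plain dict like the one built in get_agency_details.
def write_to_db(record, db_path='policeone.db'):
    import sqlite3
    columns = list(record.keys())
    conn = sqlite3.connect(db_path)
    try:
        # Store everything as TEXT; column names are quoted because they
        # contain spaces and '#'.
        conn.execute('CREATE TABLE IF NOT EXISTS agencies ({})'.format(
            ', '.join('"{}" TEXT'.format(c) for c in columns)))
        conn.execute(
            'INSERT INTO agencies ({}) VALUES ({})'.format(
                ', '.join('"{}"'.format(c) for c in columns),
                ', '.join('?' for _ in columns)),
            [record[c] for c in columns])
        conn.commit()
    finally:
        conn.close()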


if __name__ == '__main__':
    scraper = Scraper()
    scraper.run(write_on=True)