Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nashville scraper #317

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
114 changes: 114 additions & 0 deletions nashville/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# encoding=utf-8
from pupa.scrape import Jurisdiction, Organization
from .events import NashvilleEventScraper
from .people import NashvillePersonScraper
from .bills import NashvilleBillScraper
from .vote_events import NashvilleVoteEventScraper


class Nashville(Jurisdiction):
    """Jurisdiction definition for the Nashville / Davidson County metro government.

    Declares the scrapers to run, the legislative sessions bills are filed
    under, and the organizations (Mayor's Office and Metropolitan Council)
    together with the posts each one holds.
    """

    division_id = "ocd-division/country:us/state:tn/place:nashville"
    classification = "government"
    name = "Metropolitan Government of Nashville & Davidson County"
    url = "https://www.nashville.gov/"

    scrapers = {
        "events": NashvilleEventScraper,
        "people": NashvillePersonScraper,
        "bills": NashvilleBillScraper,
        # "vote_events": NashvilleVoteEventScraper,
    }

    # Technically these might be terms. They mirror the format being scraped.
    legislative_sessions = [
        {"identifier": "2019-2023", "name": "2019 - 2023 Term",
         "start_date": "2019", "end_date": "2023"},
        {"identifier": "2015-2019", "name": "2015 - 2019 Term",
         "start_date": "2015", "end_date": "2019"},
        # Start/end were previously swapped for this term.
        {"identifier": "2011-2015", "name": "2011 - 2015 Term",
         "start_date": "2011", "end_date": "2015"},
        {"identifier": "2007-2011", "name": "2007 - 2011 Term",
         "start_date": "2007", "end_date": "2011"},
        {"identifier": "2003-2007", "name": "2003 - 2007 Term",
         "start_date": "2003", "end_date": "2007"},
        {"identifier": "1999-2003", "name": "1999 - 2003 Term",
         "start_date": "1999", "end_date": "2003"},
        {"identifier": "1995-1999", "name": "1995 - 1999 Term",
         "start_date": "1995", "end_date": "1999"},
    ]

    def get_organizations(self):
        """Yield the Mayor's Office and then the Metropolitan Council.

        Each organization is yielded with its posts already attached.
        """
        # Mayor's Office
        mayors_office = Organization(
            name="Mayor's Office", classification="executive")

        # The mayor post is created without a division_id, unlike the staff posts.
        mayors_office.add_post(label="Mayor", role="mayor")

        # Copied from https://www.nashville.gov/Mayors-Office.aspx
        # NOTE(review): "Senior Legislative Advisor" appears twice in this
        # list; confirm whether two posts with that label are intended.
        staff_labels = (
            "Receptionist / Administrative Assistant",
            "Metro General Services Photographer",
            "Press Secretary",
            "Communications Director",
            "Director, Mayor’s Office of Transportation and Sustainability",
            "Special Assistant to the Mayor / Scheduler",
            "Director, Mayor’s Office of Housing",
            "Deputy Resilience Officer",
            "Senior Advisor, Workforce, Diversity and Inclusion",
            "Transportation & Sustainability Manager",
            "Chief Strategy Officer",
            "Manager, Small Business/Creative Economy",
            "Executive Director, The Barnes Fund",
            "Chief of Staff",
            "Director, Mayor’s Office of Neighborhoods and Community Engagement",
            "Director of Community Engagement",
            "Early Childhood Education Project Manager",
            "Senior Advisor for Education",
            "Senior Legislative Advisor",
            "Chief Operating Officer",
            "Youth Policy Consultant",
            "Executive Assistant to Chief Operating Officer",
            "Executive Assistant to the Mayor",
            "Community Relations Liaison",
            "Director, Constituent Response/hubNashville",
            "Director, Mayor’s Office of Economic and Community Development",
            "Senior Legislative Advisor",
            "Senior Advisor for Health and Wellness Policy",
        )
        for staff_label in staff_labels:
            mayors_office.add_post(
                label=staff_label, role="staff", division_id=self.division_id)

        yield mayors_office

        # City Council
        city_council = Organization(
            name="Nashville Metropolitan Council", classification="legislature")
        city_council.add_post(
            label="Vice Mayor", role="vicemayor", division_id=self.division_id)

        # NOTE(review): only 4 at-large seats and 3 districts are created
        # here; verify these counts against the council's actual composition
        # and against the post labels the people scraper emits.
        AT_LARGE_SEATS = 4
        for at_large_seat in range(1, AT_LARGE_SEATS + 1):
            city_council.add_post(
                label="At Large({})".format(at_large_seat),
                role="councilmember",
                division_id=self.division_id)

        DISTRICTS = 3
        for district in range(1, DISTRICTS + 1):
            city_council.add_post(
                label="District {}".format(district),
                role="councilmember",
                division_id=self.division_id)

        yield city_council
        # @TODO Add committees
        # https://www.nashville.gov/Metro-Council/Council-Committees.aspx
131 changes: 131 additions & 0 deletions nashville/bills.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
from os import path
from datetime import datetime
from collections import namedtuple
from pupa.scrape import Bill, Jurisdiction
from pupa.exceptions import DuplicateItemError
from .utils import NashvilleScraper


class NashvilleBillScraper(NashvilleScraper):
    """Scrape Metro Council resolutions, one council term page at a time."""

    # Identifiers of bills already yielded. This is a class attribute that is
    # mutated in place, so it is shared by every instance of the scraper.
    titles = []

    def scrape(self):
        """Yield every bill found via the resolutions index page."""
        yield from self.get_resolution_by_council_term()
        # TODO: Add zoning amendments
        # http://www.nashville.gov/Metro-Clerk/Council-Meeting-Schedules/Public-Hearing-Notice.aspx

    def get_resolution_by_council_term(self):
        """Follow each council-term link on the index page and yield its bills."""
        base_url = 'http://www.nashville.gov/Metro-Clerk/Legislative/Resolutions.aspx'
        doc = self.lxmlize(base_url)
        dnn_name = self.get_dnn_name(doc)
        data_id = 'dnn_ctr{}_HtmlModule_lblContent'.format(dnn_name)
        council_term_urls = doc.xpath(
            '//div[@id="{}"]/p/a[contains(@href, ".aspx")]/@href'.format(data_id))
        for term_url in council_term_urls:
            yield from self.get_resolutions(term_url)

    def get_resolutions(self, url):
        """Yield the bills listed on a single council-term page.

        The session identifier is taken from the page's file name and stored
        on ``self.bill_session``. Terms whose first year is within four years
        of the current year use the legislation-list module layout; older
        terms starting after 2011 use the HTML-module layout; anything else
        is skipped.

        Raises:
            ValueError: if the file name does not start with a four-digit
                year, or if a recent term page does not contain exactly one
                legislation-list module anchor.
        """
        doc = self.lxmlize(url)
        self.bill_session = self.get_session_year(url)

        current_year = datetime.now().year
        session_year = int(self.bill_session[0:4])
        current_session = session_year + 4 >= current_year
        if current_session:
            (dnn_name, ) = doc.xpath(
                '//div[contains(@class, "DnnModule-NV-LVMSLegislation-List")]/a/@name')
            bill_elements = doc.xpath(
                '//div[@id="dnn_ctr{}_LVMSLegislationList_pnlListWrap"]/p'.format(dnn_name))
            yield from self.get_session_bills(bill_elements)

            # An optional second container directly follows the module; when
            # present, the element itself is handed over so that iterating it
            # walks its children.
            second_list = []
            try:
                (second_list, ) = doc.xpath(
                    '//div[contains(@class, "DnnModule-{}")]/following-sibling::div[1]'.format(dnn_name))
            except ValueError:
                pass

            yield from self.get_session_bills(second_list)
        elif session_year > 2011:
            # Skipping non dot net bill pages
            dnn_name = self.get_dnn_name(doc)
            bill_elements = doc.xpath(
                '//div[@id="dnn_ctr{}_HtmlModule_lblContent"]/p'.format(dnn_name))
            yield from self.get_session_bills(bill_elements)

    def get_session_year(self, url):
        """Return the file name of ``url`` up to its first dot (e.g. ``2019-2023``)."""
        filename = path.basename(url)
        return filename.split('.')[0]

    def get_session_bills(self, bill_elements):
        """Yield a ``Bill`` for each title/summary pair in ``bill_elements``.

        Elements are consumed in pairs: the element at an even index is the
        heading paragraph (expected to hold a single anchor whose text ends
        with the bill identifier), and the following odd-indexed element
        supplies the bill title text. Identifiers already recorded in
        ``self.titles`` are skipped.

        Raises:
            ValueError: if the summary element of a new bill does not hold
                exactly one direct text node.
        """
        title_p = None
        for idx, element in enumerate(bill_elements):
            if idx % 2 == 0:
                # Even position: remember the heading for the next element.
                title_p = element
                continue

            try:
                (administrative_title, ) = title_p.xpath('./a/text()')
            except ValueError:
                # The first row might be the title and won't contain an anchor tag
                continue

            # The identifier is the last space-separated token of the heading.
            identifier = administrative_title.split(' ')[-1]
            if identifier in self.titles:
                continue

            (link, ) = title_p.xpath('./a/@href')
            (title, ) = element.xpath('./text()')
            self.titles.append(identifier)
            # sort_files() reads this to recognise the main bill document.
            self.current_bill_identifier = identifier
            new_bill = Bill(identifier=identifier,
                            title=title,
                            classification='resolution',
                            legislative_session=self.bill_session,
                            from_organization={"name": "Nashville Metropolitan Council"})
            new_bill = self.get_bill_detail(link, new_bill)
            # We can't save a bill with no sources
            new_bill.add_source(link)
            yield new_bill

    def get_bill_detail(self, url, bill):
        """Fetch the bill's detail page and attach what can be parsed from it.

        Returns ``bill`` unchanged when a ``ValueError`` is raised while
        locating or parsing the detail module.
        """
        bill_doc = self.lxmlize(url)
        try:
            dnn_name = self.get_dnn_name(bill_doc)
            return self.dot_net_bill_detail(dnn_name, bill_doc, bill)
        except ValueError:
            # TODO: Handle non dot net bill pages
            pass
        return bill

    def dot_net_bill_detail(self, dnn_name, bill_doc, bill):
        """Attach supporting files and the roll-call vote PDF (if any) to ``bill``."""
        legislative_div_id = 'dnn_ctr{}_LVMSLegislationDetails_pnlLegislationDetails'.format(dnn_name)
        supporting_files = bill_doc.xpath(
            '//div[@id="{}"]/descendant::*/a[contains(@href, "files")]/@href'.format(legislative_div_id))
        bill = self.sort_files(supporting_files, bill)
        try:
            (vote_pdf, ) = bill_doc.xpath(
                '//div[@id="{}"]/descendant::*/a[contains(@href, "roll-call-votes")]/@href'.format(legislative_div_id))
            # TODO: Parse votes
            bill.add_document_link(note='roll-call-vote', url=vote_pdf, media_type="application/pdf")
        except ValueError:
            # Vote PDF is not available
            pass
        return bill

    def sort_files(self, supporting_files, bill):
        """Attach each supporting file URL to ``bill`` as a source or a document.

        A file named ``<current bill identifier>.pdf`` is added as a source.
        Any other file is added as a document link whose note is the
        lower-cased second ``_``-separated part of the file name without its
        extension. If that fails for any reason, the file is added as a
        source instead.
        """
        for support_file in supporting_files:
            filename = path.basename(support_file)
            if filename == self.current_bill_identifier + '.pdf':
                # This is the main bill file
                # TODO: Add sponsors
                bill.add_source(note='detail', url=support_file)
                continue

            try:
                stem = path.splitext(filename)[0]
                note = stem.split('_')[1].lower()
                bill.add_document_link(
                    note=note if note else ' - ',
                    url=support_file,
                    media_type="application/pdf")
            except Exception:
                # It is possible that the doc doesn't fit either format so
                # let's add it as detail. Deliberately broad to keep the
                # best-effort behaviour, but no longer swallows
                # KeyboardInterrupt / SystemExit.
                bill.add_source(note='detail', url=support_file)

        return bill



57 changes: 57 additions & 0 deletions nashville/events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import datetime
import pytz
from pupa.scrape import Event

from .utils import NashvilleScraper


class NashvilleEventScraper(NashvilleScraper):
    """Scrape Metro Council events from the council events calendar."""

    def scrape(self):
        """Yield every event this scraper currently collects."""
        yield from self.scheduled_meetings()
        # needs to be implemented
        # yield from self.meeting_minutes_archive()

    def _localized_start(self, raw_date_time):
        """Parse a calendar date string and return it localized to US/Central.

        Tries ``MM-DD-YYYY HH:MM AM/PM`` first; if that fails, dots are
        removed and ``MM/DD/YYYY HH:MM AM/PM`` is tried. A failure of the
        second attempt propagates as ``ValueError``.
        """
        try:
            naive_start = datetime.datetime.strptime(
                raw_date_time, '%m-%d-%Y %I:%M %p')
        except ValueError:
            without_dots = raw_date_time.replace('.', '')
            naive_start = datetime.datetime.strptime(
                without_dots, '%m/%d/%Y %I:%M %p')
        central = pytz.timezone("US/Central")
        return central.localize(naive_start)

    def scheduled_meetings(self):
        """Yield an ``Event`` for each link in the calendar's events widget.

        Every linked detail page is fetched to read the title, start time,
        location and description. All events are marked ``confirmed``.
        """
        calendar_url = 'http://www.nashville.gov/Metro-Council/Council-Events-Calendar.aspx'
        calendar_doc = self.lxmlize(calendar_url)
        anchors = calendar_doc.xpath('//ul[@class="eventswidget"]/descendant::*/a')
        for anchor in anchors:
            (event_link,) = anchor.xpath('./@href')
            detail_doc = self.lxmlize(event_link)
            dnn_name = self.get_dnn_name(detail_doc)

            details_id = 'dnn_ctr{}_EventDetails_pnlEvent'.format(dnn_name)
            (title, ) = detail_doc.xpath(
                '//div[@id="{}"]/h1/text()'.format(details_id))

            # strip_string_array is defined on the base class; presumably it
            # collapses the text nodes into one cleaned string - confirm there.
            when_nodes = detail_doc.xpath(
                '//div[@id="{}"]/p/em/text()'.format(details_id))
            start = self._localized_start(self.strip_string_array(when_nodes))

            # The first text node of the location panel is the venue name;
            # the remaining nodes (the address) are not used yet.
            location_id = 'dnn_ctr{}_EventDetails_pnlLocation'.format(dnn_name)
            location_nodes = detail_doc.xpath(
                '//div[@id="{}"]/p/text()'.format(location_id))
            if location_nodes:
                location_name = location_nodes[0]
            else:
                location_name = ' - '

            description_nodes = detail_doc.xpath(
                '//div[@id="{}"]/p[3]/text()'.format(details_id))
            description = self.strip_string_array(description_nodes)

            # @TODO: Marking everything as confirmed - need to learn how cancelled meetings are posted
            # @TODO: Geocode *address
            event = Event(
                name=title,
                start_date=start,
                location_name=location_name,
                description=description,
                status='confirmed'
            )
            event.add_source(event_link)
            yield event

    def meeting_minutes_archive(self):
        """Placeholder for scraping archived meeting minutes; does nothing yet."""
        # Prior
        # http://www.nashville.gov/Metro-Clerk/Legislative/Minutes.aspx
        pass
Loading