-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#120 Stagecoach scraper script first pass
- Loading branch information
1 parent
4da1363
commit 13a72f1
Showing
3 changed files
with
305 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
[ | ||
{ | ||
"title": "Stagecoach Bluebird - Schedules and Fares", | ||
"owner": "Stagecoach", | ||
"pageURL": "https://www.stagecoachbus.com/open-data", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"licence": "UNKNOWN", | ||
"description": "Schedules and Fares data for the Stagecoach Bluebird region", | ||
"tags": [], | ||
"resources": [ | ||
{ | ||
"fileName": "Schedules (TXC 2.1)", | ||
"fileSize": "14221478", | ||
"fileSizeUnit": "B", | ||
"fileType": "ZIP", | ||
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-sblb-route-schedule-data-transxchange.zip", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"numRecords": null | ||
}, | ||
{ | ||
"fileName": "Schedules (TXC 2.4)", | ||
"fileSize": "18022731", | ||
"fileSizeUnit": "B", | ||
"fileType": "ZIP", | ||
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-sblb-route-schedule-data-transxchange_2_4.zip", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"numRecords": null | ||
}, | ||
{ | ||
"fileName": "Fares", | ||
"fileSize": "1659477", | ||
"fileSizeUnit": "B", | ||
"fileType": "ZIP", | ||
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-sblb-fares-data-netex.zip", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"numRecords": null | ||
} | ||
] | ||
}, | ||
{ | ||
"title": "Stagecoach East Scotland - Schedules and Fares", | ||
"owner": "Stagecoach", | ||
"pageURL": "https://www.stagecoachbus.com/open-data", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"licence": "UNKNOWN", | ||
"description": "Schedules and Fares data for the Stagecoach East Scotland region", | ||
"tags": [], | ||
"resources": [ | ||
{ | ||
"fileName": "Schedules (TXC 2.1)", | ||
"fileSize": "22545284", | ||
"fileSizeUnit": "B", | ||
"fileType": "ZIP", | ||
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-scfi-route-schedule-data-transxchange.zip", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"numRecords": null | ||
}, | ||
{ | ||
"fileName": "Schedules (TXC 2.4)", | ||
"fileSize": "34937247", | ||
"fileSizeUnit": "B", | ||
"fileType": "ZIP", | ||
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-scfi-route-schedule-data-transxchange_2_4.zip", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"numRecords": null | ||
}, | ||
{ | ||
"fileName": "Fares", | ||
"fileSize": "7533115", | ||
"fileSizeUnit": "B", | ||
"fileType": "ZIP", | ||
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-scfi-fares-data-netex.zip", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"numRecords": null | ||
} | ||
] | ||
}, | ||
{ | ||
"title": "Stagecoach Highlands - Schedules and Fares", | ||
"owner": "Stagecoach", | ||
"pageURL": "https://www.stagecoachbus.com/open-data", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"licence": "UNKNOWN", | ||
"description": "Schedules and Fares data for the Stagecoach Highlands region", | ||
"tags": [], | ||
"resources": [ | ||
{ | ||
"fileName": "Schedules (TXC 2.1)", | ||
"fileSize": "12596019", | ||
"fileSizeUnit": "B", | ||
"fileType": "ZIP", | ||
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-schi-route-schedule-data-transxchange.zip", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"numRecords": null | ||
}, | ||
{ | ||
"fileName": "Schedules (TXC 2.4)", | ||
"fileSize": "13092518", | ||
"fileSizeUnit": "B", | ||
"fileType": "ZIP", | ||
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-schi-route-schedule-data-transxchange_2_4.zip", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"numRecords": null | ||
}, | ||
{ | ||
"fileName": "Fares", | ||
"fileSize": "831985", | ||
"fileSizeUnit": "B", | ||
"fileType": "ZIP", | ||
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-schi-fares-data-netex.zip", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"numRecords": null | ||
} | ||
] | ||
}, | ||
{ | ||
"title": "Stagecoach West Scotland - Schedules and Fares", | ||
"owner": "Stagecoach", | ||
"pageURL": "https://www.stagecoachbus.com/open-data", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"licence": "UNKNOWN", | ||
"description": "Schedules and Fares data for the Stagecoach West Scotland region", | ||
"tags": [], | ||
"resources": [ | ||
{ | ||
"fileName": "Schedules (TXC 2.1)", | ||
"fileSize": "12189221", | ||
"fileSizeUnit": "B", | ||
"fileType": "ZIP", | ||
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-stws-route-schedule-data-transxchange.zip", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"numRecords": null | ||
}, | ||
{ | ||
"fileName": "Schedules (TXC 2.4)", | ||
"fileSize": "16742761", | ||
"fileSizeUnit": "B", | ||
"fileType": "ZIP", | ||
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-stws-route-schedule-data-transxchange_2_4.zip", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"numRecords": null | ||
}, | ||
{ | ||
"fileName": "Fares", | ||
"fileSize": "2587759", | ||
"fileSizeUnit": "B", | ||
"fileType": "ZIP", | ||
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-stws-fares-data-netex.zip", | ||
"dateCreated": null, | ||
"dateUpdated": null, | ||
"numRecords": null | ||
} | ||
] | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
from bs4 import BeautifulSoup | ||
from processor import Processor | ||
|
||
|
||
class ProcessorStagecoach(Processor):
    """Processor for Stagecoach's open data portal.

    Scrapes the portal page, keeps only the rows whose region title is listed
    in ``ACCEPTED_REGIONS`` and emits one dataset entry per region, with one
    resource per download button found for that region.

    NOTE(review): ``get_html``, ``get_http_content_length``, ``write_json`` and
    ``process`` are not defined in this class; presumably they are provided by
    the ``Processor`` base class - confirm against ``processor.py``.
    """

    # Region titles (lower-cased text of a row's <b> element) that are kept.
    ACCEPTED_REGIONS = [
        "stagecoach bluebird",
        "stagecoach east scotland",
        "stagecoach highlands",
        "stagecoach west scotland",
    ]
    # TODO: Find out licence info from Stagecoach
    DATASETS_LICENCE = "UNKNOWN"

    def __init__(self):
        """Base init for type and URL list"""
        super().__init__(type="bespoke_Stagecoach")

    def filter_rows(self, row):
        """Return True when ``row`` is titled with one of the accepted regions.

        Args:
            row: A parsed HTML element for one portal row; its first ``<b>``
                element is treated as the region title.

        Returns:
            bool: False when the row has no ``<b>`` element, otherwise whether
            the lower-cased title text is in ``ACCEPTED_REGIONS``.
        """
        row_title = row.select_one("b")

        if row_title is None:
            return False

        return row_title.text.lower() in self.ACCEPTED_REGIONS

    def get_datasets(self, owner, url, fname):
        """Gets datasets from provided portal and outputs to JSON

        Args:
            owner: Value stored as each dataset's ``owner``.
            url: Portal page to scrape; also stored as each dataset's
                ``pageURL``.
            fname: Output destination handed to ``write_json``.
        """
        print(f"Processing {url}")

        # Use ``self`` rather than the module-level ``processor`` instance so
        # the method works on whichever instance it is invoked on.
        portal_html = self.get_html(url)

        # PATCH: The page's closing </style tag is missing its ">" so we add it
        # TODO: Contact Stagecoach and ask if they can fix their HTML
        portal_html = portal_html.replace("</style\n", "</style>\n")

        parsed_portal_html = BeautifulSoup(portal_html, features="html.parser")
        dataset_list_rows = parsed_portal_html.select(".rich-text .row")

        # Remove non-Scottish rows
        filtered_dataset_list_rows = [
            row for row in dataset_list_rows if self.filter_rows(row)
        ]

        print(f"Found {len(filtered_dataset_list_rows)} datasets")

        prepped_datasets = []

        for region in filtered_dataset_list_rows:
            region_title = region.select_one("b").text

            # No dates are available from the page, so these stay None.
            dataset_date_created = None
            dataset_date_updated = None

            # Check for presence of download buttons
            schedules_txc_2_1_button = region.select_one(
                "a:-soup-contains('Schedules (TXC 2.1)')"
            )
            schedules_txc_2_4_button = region.select_one(
                "a:-soup-contains('Schedules (TXC 2.4)')"
            )
            fares_button = region.select_one("a:-soup-contains('Fares')")

            has_schedules = (
                schedules_txc_2_1_button is not None
                or schedules_txc_2_4_button is not None
            )
            has_fares = fares_button is not None

            # Build title and description.
            # NOTE(review): a region with no download buttons keeps the bare
            # "<region> - " title, a None description and no resources.
            dataset_title = f"{region_title} - "
            dataset_description = None

            if has_schedules and has_fares:
                dataset_title += "Schedules and Fares"
                dataset_description = (
                    f"Schedules and Fares data for the {region_title} region"
                )
            elif has_schedules:
                dataset_title += "Schedules"
                dataset_description = f"Schedules data for the {region_title} region"
            elif has_fares:
                dataset_title += "Fares"
                dataset_description = f"Fares data for the {region_title} region"

            # Keep only the buttons that were actually found, in a fixed order.
            buttons = [
                button
                for button in (
                    schedules_txc_2_1_button,
                    schedules_txc_2_4_button,
                    fares_button,
                )
                if button is not None
            ]

            dataset_resources = []

            for button in buttons:
                asset_url = button["href"]
                file_size = self.get_http_content_length(asset_url)
                dataset_resources.append(
                    {
                        "fileName": button.text,
                        "fileSize": file_size,
                        "fileSizeUnit": "B",
                        "fileType": "ZIP",
                        "assetUrl": asset_url,
                        "dateCreated": dataset_date_created,
                        "dateUpdated": dataset_date_updated,
                        "numRecords": None,
                    }
                )

            prepped_datasets.append(
                {
                    "title": dataset_title,
                    "owner": owner,
                    "pageURL": url,
                    "dateCreated": dataset_date_created,
                    "dateUpdated": dataset_date_updated,
                    "licence": self.DATASETS_LICENCE,
                    "description": dataset_description,
                    "tags": [],
                    "resources": dataset_resources,
                }
            )

        self.write_json(fname, prepped_datasets)
|
||
|
||
# Module-level processor instance, created when this module is imported.
processor = ProcessorStagecoach()

# When run as a script, start processing with the "json" argument.
if __name__ == "__main__":
    processor.process("json")