Skip to content

Commit

Permalink
#120 Stagecoach scraper script first pass
Browse files Browse the repository at this point in the history
  • Loading branch information
JackGilmore committed Oct 29, 2023
1 parent 4da1363 commit 13a72f1
Show file tree
Hide file tree
Showing 3 changed files with 305 additions and 1 deletion.
170 changes: 170 additions & 0 deletions data/bespoke_Stagecoach/Stagecoach.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
[
{
"title": "Stagecoach Bluebird - Schedules and Fares",
"owner": "Stagecoach",
"pageURL": "https://www.stagecoachbus.com/open-data",
"dateCreated": null,
"dateUpdated": null,
"licence": "UNKNOWN",
"description": "Schedules and Fares data for the Stagecoach Bluebird region",
"tags": [],
"resources": [
{
"fileName": "Schedules (TXC 2.1)",
"fileSize": "14221478",
"fileSizeUnit": "B",
"fileType": "ZIP",
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-sblb-route-schedule-data-transxchange.zip",
"dateCreated": null,
"dateUpdated": null,
"numRecords": null
},
{
"fileName": "Schedules (TXC 2.4)",
"fileSize": "18022731",
"fileSizeUnit": "B",
"fileType": "ZIP",
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-sblb-route-schedule-data-transxchange_2_4.zip",
"dateCreated": null,
"dateUpdated": null,
"numRecords": null
},
{
"fileName": "Fares",
"fileSize": "1659477",
"fileSizeUnit": "B",
"fileType": "ZIP",
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-sblb-fares-data-netex.zip",
"dateCreated": null,
"dateUpdated": null,
"numRecords": null
}
]
},
{
"title": "Stagecoach East Scotland - Schedules and Fares",
"owner": "Stagecoach",
"pageURL": "https://www.stagecoachbus.com/open-data",
"dateCreated": null,
"dateUpdated": null,
"licence": "UNKNOWN",
"description": "Schedules and Fares data for the Stagecoach East Scotland region",
"tags": [],
"resources": [
{
"fileName": "Schedules (TXC 2.1)",
"fileSize": "22545284",
"fileSizeUnit": "B",
"fileType": "ZIP",
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-scfi-route-schedule-data-transxchange.zip",
"dateCreated": null,
"dateUpdated": null,
"numRecords": null
},
{
"fileName": "Schedules (TXC 2.4)",
"fileSize": "34937247",
"fileSizeUnit": "B",
"fileType": "ZIP",
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-scfi-route-schedule-data-transxchange_2_4.zip",
"dateCreated": null,
"dateUpdated": null,
"numRecords": null
},
{
"fileName": "Fares",
"fileSize": "7533115",
"fileSizeUnit": "B",
"fileType": "ZIP",
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-scfi-fares-data-netex.zip",
"dateCreated": null,
"dateUpdated": null,
"numRecords": null
}
]
},
{
"title": "Stagecoach Highlands - Schedules and Fares",
"owner": "Stagecoach",
"pageURL": "https://www.stagecoachbus.com/open-data",
"dateCreated": null,
"dateUpdated": null,
"licence": "UNKNOWN",
"description": "Schedules and Fares data for the Stagecoach Highlands region",
"tags": [],
"resources": [
{
"fileName": "Schedules (TXC 2.1)",
"fileSize": "12596019",
"fileSizeUnit": "B",
"fileType": "ZIP",
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-schi-route-schedule-data-transxchange.zip",
"dateCreated": null,
"dateUpdated": null,
"numRecords": null
},
{
"fileName": "Schedules (TXC 2.4)",
"fileSize": "13092518",
"fileSizeUnit": "B",
"fileType": "ZIP",
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-schi-route-schedule-data-transxchange_2_4.zip",
"dateCreated": null,
"dateUpdated": null,
"numRecords": null
},
{
"fileName": "Fares",
"fileSize": "831985",
"fileSizeUnit": "B",
"fileType": "ZIP",
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-schi-fares-data-netex.zip",
"dateCreated": null,
"dateUpdated": null,
"numRecords": null
}
]
},
{
"title": "Stagecoach West Scotland - Schedules and Fares",
"owner": "Stagecoach",
"pageURL": "https://www.stagecoachbus.com/open-data",
"dateCreated": null,
"dateUpdated": null,
"licence": "UNKNOWN",
"description": "Schedules and Fares data for the Stagecoach West Scotland region",
"tags": [],
"resources": [
{
"fileName": "Schedules (TXC 2.1)",
"fileSize": "12189221",
"fileSizeUnit": "B",
"fileType": "ZIP",
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-stws-route-schedule-data-transxchange.zip",
"dateCreated": null,
"dateUpdated": null,
"numRecords": null
},
{
"fileName": "Schedules (TXC 2.4)",
"fileSize": "16742761",
"fileSizeUnit": "B",
"fileType": "ZIP",
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-stws-route-schedule-data-transxchange_2_4.zip",
"dateCreated": null,
"dateUpdated": null,
"numRecords": null
},
{
"fileName": "Fares",
"fileSize": "2587759",
"fileSizeUnit": "B",
"fileType": "ZIP",
"assetUrl": "https://opendata.stagecoachbus.com/stagecoach-stws-fares-data-netex.zip",
"dateCreated": null,
"dateUpdated": null,
"numRecords": null
}
]
}
]
3 changes: 2 additions & 1 deletion sources.csv
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@ Public Health Scotland,https://www.opendata.nhs.scot/,ckan
Statistics Scottish Government, http://statistics.gov.scot/sparql,sparkql
Research Data Scotland, https://find.researchdata.scot/,ckan
Spatial Hub, https://data.spatialhub.scot/,ckan
Scottish Parliament,https://data.parliament.scot/,bespoke_ScottishParliament
Scottish Parliament,https://data.parliament.scot/,bespoke_ScottishParliament
Stagecoach,https://www.stagecoachbus.com/open-data,bespoke_Stagecoach
133 changes: 133 additions & 0 deletions stagecoach.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
from bs4 import BeautifulSoup
from processor import Processor


class ProcessorStagecoach(Processor):
    """Processor for Stagecoach's open data portal.

    Parses the region rows of the portal page, keeps only the regions listed
    in ``ACCEPTED_REGIONS`` and emits one dataset per region, with one
    resource for each download link found (TXC 2.1 schedules, TXC 2.4
    schedules and fares).
    """

    # Lower-cased region headings to keep; every other row is discarded.
    ACCEPTED_REGIONS = [
        "stagecoach bluebird",
        "stagecoach east scotland",
        "stagecoach highlands",
        "stagecoach west scotland",
    ]
    # TODO: Find out licence info from Stagecoach
    DATASETS_LICENCE = "UNKNOWN"

    def __init__(self):
        """Initialise the base processor with this portal's source type."""
        super().__init__(type="bespoke_Stagecoach")

    def filter_rows(self, row):
        """Return True when ``row`` is headed by one of the accepted regions.

        Args:
            row: A parsed row element from the portal page; its first ``<b>``
                element is treated as the region heading.

        Returns:
            bool: False when the row has no ``<b>`` heading or the heading is
            not in ``ACCEPTED_REGIONS`` (compared case-insensitively, ignoring
            surrounding whitespace).
        """
        row_title = row.select_one("b")

        if row_title is None:
            return False

        return row_title.text.strip().lower() in self.ACCEPTED_REGIONS

    @staticmethod
    def _contents_label(has_schedules, has_fares):
        """Return the label naming the kinds of data on offer, or None if none."""
        if has_schedules and has_fares:
            return "Schedules and Fares"
        if has_schedules:
            return "Schedules"
        if has_fares:
            return "Fares"
        return None

    def get_datasets(self, owner, url, fname):
        """Gets datasets from provided portal and outputs to JSON.

        Args:
            owner: Value stored as each dataset's ``owner``.
            url: Portal page to scrape; also stored as each dataset's
                ``pageURL``.
            fname: Passed through to ``write_json`` together with the
                prepared datasets.
        """
        print(f"Processing {url}")

        # Use this instance rather than the module-level ``processor`` global
        # so the method also works for other instances and subclasses.
        # NOTE(review): get_html is inherited and presumably returns the page
        # body as a string -- confirm against Processor.
        portal_html = self.get_html(url)

        # PATCH: This page contains an unclosed <style> tag so we're closing it
        # TODO: Contact Stagecoach and ask if they can fix their HTML
        portal_html = portal_html.replace("</style\n", "</style>\n")

        parsed_portal_html = BeautifulSoup(portal_html, features="html.parser")
        dataset_list_rows = parsed_portal_html.select(".rich-text .row")

        # Remove non-Scottish rows
        filtered_dataset_list_rows = [
            row for row in dataset_list_rows if self.filter_rows(row)
        ]

        print(f"Found {len(filtered_dataset_list_rows)} datasets")

        prepped_datasets = []

        for region in filtered_dataset_list_rows:
            region_title = region.select_one("b").text.strip()

            # The page exposes no dates, so these stay empty for both the
            # dataset and each of its resources.
            dataset_date_created = None
            dataset_date_updated = None

            # Check for presence of download buttons
            schedules_txc_2_1_button = region.select_one(
                "a:-soup-contains('Schedules (TXC 2.1)')"
            )
            schedules_txc_2_4_button = region.select_one(
                "a:-soup-contains('Schedules (TXC 2.4)')"
            )
            fares_button = region.select_one("a:-soup-contains('Fares')")

            has_schedules = (
                schedules_txc_2_1_button is not None
                or schedules_txc_2_4_button is not None
            )
            has_fares = fares_button is not None

            contents_label = self._contents_label(has_schedules, has_fares)
            if contents_label is None:
                # Without any download link there is nothing to publish, and
                # the title would otherwise end in a dangling " - ".
                print(f"Skipping {region_title}: no download links found")
                continue

            buttons = [
                button
                for button in (
                    schedules_txc_2_1_button,
                    schedules_txc_2_4_button,
                    fares_button,
                )
                if button is not None
            ]

            dataset_resources = []
            for button in buttons:
                asset_url = button["href"]
                # NOTE(review): presumably the Content-Length of the asset in
                # bytes, matching the "B" unit below -- confirm in Processor.
                file_size = self.get_http_content_length(asset_url)
                dataset_resources.append(
                    {
                        "fileName": button.text,
                        "fileSize": file_size,
                        "fileSizeUnit": "B",
                        "fileType": "ZIP",
                        "assetUrl": asset_url,
                        "dateCreated": dataset_date_created,
                        "dateUpdated": dataset_date_updated,
                        "numRecords": None,
                    }
                )

            prepped_datasets.append(
                {
                    "title": f"{region_title} - {contents_label}",
                    "owner": owner,
                    "pageURL": url,
                    "dateCreated": dataset_date_created,
                    "dateUpdated": dataset_date_updated,
                    "licence": self.DATASETS_LICENCE,
                    "description": (
                        f"{contents_label} data for the {region_title} region"
                    ),
                    "tags": [],
                    "resources": dataset_resources,
                }
            )

        self.write_json(fname, prepped_datasets)


# Module-level processor instance, created whenever this module is imported.
processor = ProcessorStagecoach()

if __name__ == "__main__":
    # When run as a script, start processing with the "json" argument.
    processor.process("json")

0 comments on commit 13a72f1

Please sign in to comment.