Skip to content

Commit

Permalink
[MVV-2] Attach train station information to the public transport information (#726)
Browse files Browse the repository at this point in the history


Co-authored-by: Frank Elsinga <[email protected]>
  • Loading branch information
quarz12 and CommanderStorm authored Jul 22, 2023
1 parent a4d07f8 commit ce2504e
Showing 1 changed file with 91 additions and 38 deletions.
129 changes: 91 additions & 38 deletions data/external/scrapers/public_transport.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,107 @@
import csv
import logging
from zipfile import ZipFile

from external.scraping_utils import _download_file, CACHE_PATH, cached_json

# Base URL of the MVV (Munich public transport) open-data downloads.
MVV_OPENDATA_URL = "https://www.mvv-muenchen.de/fileadmin/mediapool/02-Fahrplanauskunft/03-Downloads/openData"
MVV_GTFS_URL = f"{MVV_OPENDATA_URL}/mvv_gtfs.zip"  # GTFS zip; stops.txt inside holds the bus stops
MVV_HST_REPORT_URL = f"{MVV_OPENDATA_URL}/MVV_HSTReport2212.csv"  # train/tram stations + some bus stations
PUBLIC_TRANSPORT_CACHE_PATH = CACHE_PATH / "public_transport"


def _load_bus_stations(stations: dict) -> None:
    """
    Load the bus stations from the MVV GTFS data and add them to the stations dict.

    Stations (rows with a truthy ``location_type``) become top-level entries keyed by
    ``stop_id``; all other rows are sub-stations attached to their parent's
    ``sub_stations`` list. ``stations.setdefault`` keeps entries that were inserted
    earlier (e.g. by the train-station loader) intact.
    """
    _download_file(MVV_GTFS_URL, PUBLIC_TRANSPORT_CACHE_PATH / "fahrplandaten.zip")
    with ZipFile(PUBLIC_TRANSPORT_CACHE_PATH / "fahrplandaten.zip") as file_zip:
        file_zip.extract("stops.txt", PUBLIC_TRANSPORT_CACHE_PATH)

    with open(PUBLIC_TRANSPORT_CACHE_PATH / "stops.txt", encoding="utf-8") as file:
        lines = list(csv.DictReader(file, delimiter=","))
    repeat_later = []  # sub-stations whose parent station is not in the dict yet
    for line in lines:
        if line["location_type"]:
            # a (parent) station; keep an existing entry if one was already loaded
            stations.setdefault(
                line["stop_id"],
                {
                    "station_id": line["stop_id"],
                    "name": line["stop_name"],
                    "lat": float(line["stop_lat"]),
                    "lon": float(line["stop_lon"]),
                    "sub_stations": [],
                },
            )
        else:
            sub_station = {
                "station_id": line["stop_id"],
                "name": line["stop_name"],
                "lat": float(line["stop_lat"]),
                "lon": float(line["stop_lon"]),
                "parent": line["parent_station"],
            }
            if not sub_station["parent"]:
                # no parent_station given: derive it from the stop_id prefix
                # (global IDs look like "de:09162:6:…" — the first 3 fields name the station)
                sub_station["parent"] = ":".join(line["stop_id"].split(":")[:3])

            if parent := stations.get(line["parent_station"]):
                parent["sub_stations"].append(sub_station)
            else:
                repeat_later.append(sub_station)

    # second pass: parents that appeared after their sub-stations in the file
    for sub in repeat_later:
        if parent := stations.get(sub["parent"]):
            parent["sub_stations"].append(sub)
        else:
            if sub["station_id"]:
                logging.warning(f"{sub['name']} with id {sub['station_id']} has no parent in our data")


def _load_train_stations(stations: dict) -> None:
    """Load the train stations from the MVV_HST_REPORT data and add them to the stations dict"""
    _download_file(MVV_HST_REPORT_URL, PUBLIC_TRANSPORT_CACHE_PATH / "train_stations.csv")
    with open(PUBLIC_TRANSPORT_CACHE_PATH / "train_stations.csv", encoding="utf-8") as file:
        # first column header carries a UTF-8 BOM ("\ufeff"); rows without a HstNummer are skipped
        lines = [line for line in csv.DictReader(file, delimiter=";") if line["\ufeffHstNummer"]]
    repeat_later = []  # when parent station is not already in dict
    for line in lines:
        if line["Globale ID"].count(":") == 2:  # example: de:09184:460
            # two colons => a station-level ID; keep an existing entry if already present
            stations.setdefault(
                line["Globale ID"],
                {
                    "station_id": line["Globale ID"],
                    "name": line["Name ohne Ort"],
                    # NOTE(review): "WGS84 X" is stored as lat and "WGS84 Y" as lon —
                    # confirm the axis naming against the CSV export
                    "lat": float(line["WGS84 X"].replace(",", ".")),  # coordinates use a decimal comma
                    "lon": float(line["WGS84 Y"].replace(",", ".")),
                    "sub_stations": [],
                },
            )
        else:
            # more than two colons => a platform/sub-station; its parent is the
            # first three ID fields (e.g. "de:09184:460:1:2" -> "de:09184:460")
            parent_id = ":".join(line["Globale ID"].split(":")[:3])
            sub_station = {
                "station_id": line["Globale ID"],
                "name": line["Name ohne Ort"],
                "lat": float(line["WGS84 X"].replace(",", ".")),
                "lon": float(line["WGS84 Y"].replace(",", ".")),
                "parent": parent_id,
            }

            if parent := stations.get(parent_id):
                parent["sub_stations"].append(sub_station)
            else:
                repeat_later.append(sub_station)
    # second pass for sub-stations whose parent appeared later in the file
    for sub in repeat_later:
        if parent := stations.get(sub["parent"]):
            parent["sub_stations"].append(sub)
        else:
            if sub["station_id"]:
                logging.warning(f"{sub['name']} with id {sub['station_id']} has no parent in our data")


@cached_json("public_transport.json")
def scrape_stations():
    """Scrape the stations from the MVV GTFS data and return them as a list of dicts"""
    collected: dict = {}
    _load_train_stations(collected)
    _load_bus_stations(collected)
    # the "parent" key was only needed while attaching sub stations; strip it
    # from every sub station before returning
    for entry in collected.values():
        for sub_station in entry["sub_stations"]:
            del sub_station["parent"]
    return sorted(collected.values(), key=lambda station: station["lat"])

0 comments on commit ce2504e

Please sign in to comment.