-
-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[MVV-2] Attach train station information to the public transport information (#726) Co-authored-by: Frank Elsinga <[email protected]>
- Loading branch information
1 parent
a4d07f8
commit ce2504e
Showing
1 changed file
with
91 additions
and
38 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,54 +1,107 @@ | ||
import csv | ||
import logging | ||
from zipfile import ZipFile | ||
|
||
from external.scraping_utils import _download_file, CACHE_PATH, cached_json | ||
|
||
MVV_GTFS_URL = "https://www.mvv-muenchen.de/fileadmin/mediapool/02-Fahrplanauskunft/03-Downloads/openData/mvv_gtfs.zip" | ||
MVV_OPENDATA_URL = "https://www.mvv-muenchen.de/fileadmin/mediapool/02-Fahrplanauskunft/03-Downloads/openData" | ||
MVV_GTFS_URL = f"{MVV_OPENDATA_URL}/mvv_gtfs.zip" | ||
MVV_HST_REPORT_URL = f"{MVV_OPENDATA_URL}/MVV_HSTReport2212.csv" # train/tram stations + some bus stations | ||
PUBLIC_TRANSPORT_CACHE_PATH = CACHE_PATH / "public_transport" | ||
|
||
|
||
@cached_json("public_transport.json") | ||
def scrape_stations(): | ||
"""Scrape the stations from the MVV GTFS data and return them as a list of dicts""" | ||
def _load_bus_stations(stations: dict) -> None: | ||
"""Load the bus stations from the MVV GTFS data and add them to stations dict""" | ||
_download_file(MVV_GTFS_URL, PUBLIC_TRANSPORT_CACHE_PATH / "fahrplandaten.zip") | ||
with ZipFile(PUBLIC_TRANSPORT_CACHE_PATH / "fahrplandaten.zip") as file_zip: | ||
file_zip.extract("stops.txt", PUBLIC_TRANSPORT_CACHE_PATH) | ||
|
||
with open(PUBLIC_TRANSPORT_CACHE_PATH / "stops.txt", encoding="utf-8") as file: | ||
lines = csv.DictReader(file, delimiter=",") | ||
stations = {} | ||
repeat_later = [] # when parent station is not already in dict | ||
for line in lines: | ||
if line["location_type"]: | ||
stations.setdefault( | ||
line["stop_id"], | ||
{ | ||
"station_id": line["stop_id"], | ||
"name": line["stop_name"], | ||
"lat": float(line["stop_lat"]), | ||
"lon": float(line["stop_lon"]), | ||
"sub_stations": [], | ||
}, | ||
) | ||
else: | ||
sub_station = { | ||
lines = list(csv.DictReader(file, delimiter=",")) | ||
repeat_later = [] # when parent station is not already in dict | ||
for line in lines: | ||
if line["location_type"]: | ||
stations.setdefault( | ||
line["stop_id"], | ||
{ | ||
"station_id": line["stop_id"], | ||
"name": line["stop_name"], | ||
"lat": float(line["stop_lat"]), | ||
"lon": float(line["stop_lon"]), | ||
"parent": line["parent_station"], | ||
} | ||
|
||
if parent := stations.get(line["parent_station"]): | ||
parent["sub_stations"].append(sub_station) | ||
else: | ||
repeat_later.append(sub_station) | ||
|
||
for sub in repeat_later: | ||
if parent := stations.get(sub["parent"]): | ||
parent["sub_stations"].append(sub) | ||
# remove parent property from sub stations | ||
for station in stations.values(): | ||
for sub in station["sub_stations"]: | ||
del sub["parent"] | ||
return sorted(stations.values(), key=lambda x: x["lat"]) | ||
"sub_stations": [], | ||
}, | ||
) | ||
else: | ||
sub_station = { | ||
"station_id": line["stop_id"], | ||
"name": line["stop_name"], | ||
"lat": float(line["stop_lat"]), | ||
"lon": float(line["stop_lon"]), | ||
"parent": line["parent_station"], | ||
} | ||
if not sub_station["parent"]: | ||
sub_station["parent"] = ":".join(line["stop_id"].split(":")[:3]) | ||
|
||
if parent := stations.get(line["parent_station"]): | ||
parent["sub_stations"].append(sub_station) | ||
else: | ||
repeat_later.append(sub_station) | ||
|
||
for sub in repeat_later: | ||
if parent := stations.get(sub["parent"]): | ||
parent["sub_stations"].append(sub) | ||
else: | ||
if sub["station_id"]: | ||
logging.warning(f"{sub['name']} with id {sub['station_id']} has no parent in our data") | ||
|
||
|
||
def _load_train_stations(stations: dict) -> None:
    """Load the train stations from the MVV HST report and add them to the stations dict.

    The report is downloaded from ``MVV_HST_REPORT_URL`` into the public transport cache
    and read as a ``;``-separated CSV. Rows with an empty ``HstNummer`` are skipped.

    A row whose ``Globale ID`` has exactly three ``:``-separated parts (example:
    ``de:09184:460``) is a station; it is only inserted if that id is not yet present.
    Every other row is a sub-station and is appended to the ``sub_stations`` list of the
    station whose id equals the first three parts of its own id. Sub-stations keep a
    temporary ``parent`` key; this function does not remove it.

    :param stations: mapping of station id to station dict, modified in place
    """
    report_path = PUBLIC_TRANSPORT_CACHE_PATH / "train_stations.csv"
    _download_file(MVV_HST_REPORT_URL, report_path)

    # "utf-8-sig" drops a leading byte-order mark when there is one, so the first column is
    # always addressable as plain "HstNummer", whether or not the report ships with a BOM.
    # newline="" is what the csv module expects from file objects.
    with open(report_path, encoding="utf-8-sig", newline="") as file:
        rows = [row for row in csv.DictReader(file, delimiter=";") if row["HstNummer"]]

    orphans = []  # sub-stations seen before their parent station was in the dict
    for row in rows:
        global_id = row["Globale ID"]
        record = {
            "station_id": global_id,
            "name": row["Name ohne Ort"],
            # the report uses a decimal comma
            # NOTE(review): X -> lat / Y -> lon is kept exactly as before; confirm against the report
            "lat": float(row["WGS84 X"].replace(",", ".")),
            "lon": float(row["WGS84 Y"].replace(",", ".")),
        }

        if global_id.count(":") == 2:  # example: de:09184:460
            stations.setdefault(global_id, {**record, "sub_stations": []})
            continue

        parent_id = ":".join(global_id.split(":")[:3])
        record["parent"] = parent_id
        if parent := stations.get(parent_id):
            parent["sub_stations"].append(record)
        else:
            orphans.append(record)

    # Second pass: by now every station of the report is in the dict.
    for sub in orphans:
        if parent := stations.get(sub["parent"]):
            parent["sub_stations"].append(sub)
        elif sub["station_id"]:
            logging.warning("%s with id %s has no parent in our data", sub["name"], sub["station_id"])
|
||
|
||
@cached_json("public_transport.json")
def scrape_stations():
    """Collect the train and bus stations and return them as a list of dicts sorted by latitude.

    Train stations are loaded before bus stations into one shared dict. Afterwards the
    "parent" key is deleted from every sub-station, and the stations are returned in
    ascending order of their "lat" value.
    """
    collected = {}
    _load_train_stations(collected)
    _load_bus_stations(collected)

    # "parent" is only needed by the loaders; it is removed before the result is returned
    for entry in collected.values():
        for child in entry["sub_stations"]:
            child.pop("parent")

    ordered = list(collected.values())
    ordered.sort(key=lambda entry: entry["lat"])
    return ordered