diff --git a/.github/workflows/update-data.yml b/.github/workflows/update-data.yml
index 6e47eccb6..5e24c9301 100644
--- a/.github/workflows/update-data.yml
+++ b/.github/workflows/update-data.yml
@@ -24,12 +24,28 @@ jobs:
           python-version: '3.12'
       - name: Install python dependencies
         run: pip install -r data/requirements.txt -r requirements-dev.txt
-      - name: Download data
+      - run: rm -rf data/external/results/*
+      - name: Download public_transport data
         continue-on-error: true # a PR deleting all data will be created if this fails => fail obvious
         run: |
-          export PYTHONPATH=$PYTHONPATH:..
-          python3 main.py
-          ls -lah results
+          PYTHONPATH=$PYTHONPATH:.. python3 scrapers/public_transport.py
+        working-directory: data/external
+      - name: Download NAT data
+        continue-on-error: true # a PR deleting all data will be created if this fails => fail obvious
+        run: |
+          PYTHONPATH=$PYTHONPATH:.. python3 scrapers/nat.py
+        working-directory: data/external
+      - name: Download tumonline data
+        continue-on-error: true # a PR deleting all data will be created if this fails => fail obvious
+        run: PYTHONPATH=$PYTHONPATH:.. python3 scrapers/tumonline.py
+        env:
+          CONNECTUM_OAUTH_CLIENT_ID: ${{ secrets.CONECTUM_OAUTH_CLIENT_ID }}
+          CONNECTUM_OAUTH_CLIENT_SECRET: ${{ secrets.CONECTUM_OAUTH_CLIENT_SECRET }}
+        working-directory: data/external
+      - name: Download mytum data
+        continue-on-error: true # a PR deleting all data will be created if this fails => fail obvious
+        run: |
+          PYTHONPATH=$PYTHONPATH:.. python3 scrapers/roomfinder.py
           sed -i 's/Bestelmeyer S\u00fcd/Zentralgeb\u00e4ude 2/g' results/buildings_roomfinder.json
           sed -i 's/Bestelmeyer Nord/Zentralgeb\u00e4ude 7/g' results/buildings_roomfinder.json
           sed -i 's/Bestelmeyer S\u00fcd/Zentralgeb\u00e4ude 2/g' results/maps_roomfinder.json
@@ -38,8 +54,6 @@ jobs:
           sed -i 's/Bestelmeyer Nord/Zentralgeb\u00e4ude 7/g' results/rooms_roomfinder.json
         env:
           TQDM_MININTERVAL: 100
-          CONNECTUM_OAUTH_CLIENT_ID: ${{ secrets.CONECTUM_OAUTH_CLIENT_ID }}
-          CONNECTUM_OAUTH_CLIENT_SECRET: ${{ secrets.CONECTUM_OAUTH_CLIENT_SECRET }}
         working-directory: data/external
       - continue-on-error: true # a PR deleting all data will be created if this fails => fail obvious
         run: ls -lah data/external/results
diff --git a/data/README.md b/data/README.md
index 0b9f7fde8..9f3ecb4a3 100644
--- a/data/README.md
+++ b/data/README.md
@@ -1,4 +1,4 @@
-# NavigaTUM Data Repository
+# Data-Pipeline
 
 This folder contains:
 
@@ -8,9 +8,10 @@ This folder contains:
 
 The code to retrieve external data, as well as externally retrieved data is located under `external`.
 
-⚠️ A lot of this code is more a work in progress than finished. Especially features such as POIs, custom maps or other data types such as events are drafted but not yet fully implemented.
-
-Also, new external data might break the scripts from time to time, as either rooms or buildings are removed, the external data has errors or we make assumptions here that turn out to be wrong.
+> [!WARNING]
+> A lot of this code is more a work in progress than finished. Especially features such as POIs, custom maps or other data types such as events are drafted but not yet fully implemented.
+>
+> New external data might break the scripts from time to time, as either rooms or buildings are removed, the external data has errors or we make assumptions here that turn out to be wrong.
 
 ## Getting started
 
@@ -34,30 +35,26 @@ pip install -r data/requirements.txt -r requirements-dev.txt
 
 ## Getting external data
 
-External data (and the scrapers) are stored in the `external/` subdirectory.
+> [!TIP]
+> The latest scraped data is already included in the `external/results` directory,
+> you do not need to run the scraping yourself and can skip to the next step.
 
-The latest scraped data is already included in this directory, you do not need to run the scraping yourself and can skip to the next step.
+External data (and the scrapers) are stored in the `external/` subdirectory.
 
-However, if you want to update the scraped data, open `external/main.py` and comment out all
-steps depending on what specific data you want to scrape (Note that some steps depend on previous
-steps. In this case, the downloader will automatically run these as well).
+You can run a scraper from `external/scrapers/`.
+All newer scrapers are pretty quick => no shenanigans like commenting out lines are needed.
 
-Then, start scraping with:
+You can scrape with:
 
 ```bash
 cd external
 export PYTHONPATH=$PYTHONPATH:..
-python3 main.py
-```
-
-The data will be stored in the `cache` subdirectory as json files. To force a redownload, delete them.
-
-As a last step, move the `.json` files from the cache directory into the external directory, so that
-it contains the most recent scraped results, and then go back:
-
-```bash
-mv cache/buildings* cache/rooms* cache/maps* cache/usages* .
-cd ..
+python3 nat.py
+python3 public_transport.py
+python3 roomfinder.py
+export CONNECTUM_OAUTH_CLIENT_ID=GIVEN_OUT_AS_NEEDED
+export CONNECTUM_OAUTH_CLIENT_SECRET=GIVEN_OUT_AS_NEEDED
+python3 tumonline.py
 ```
 
 ### Compiling the data
@@ -72,23 +69,23 @@ The exported datasets will be stored in `output/` as JSON files.
 
 ```bash
 data
-├── external/          # 🠔 This is the sub-repository containing externally retrieved data
-├── output/            # 🠔 Here the final, compiled datasets will be stored
-├── processors/        # 🠔 Processing code
-├── sources/           # 🠔 Custom data and patches
+├── external/
+│   ├── output/        # 🠔 Here the final, compiled datasets will be stored
+│   └── scrapers/      # how we download
+├── processors/        # Processing code
+├── sources/           # Custom data and patches
 │   ├── img/
 │   └── 
-├── compile.py         # 🠔 The main script
-└── data-format_*.yaml # 🠔 Data format specification
+└── compile.py         # The main script to compile the datasources into our data representation
 ```
 
 Deployment related there are also these files:
 
 ```bash
 data
-├── Dockerfile         # 🠔 Main dockerfile, in the deployment this is sometimes called the cdn
-├── ngnix.conf         # 🠔 ngnix cofigureation file used by above Dockerfile
-└── requirements.txt   # 🠔 python dependencys
+├── Dockerfile         # Main dockerfile, in the deployment this is sometimes called the cdn
+├── ngnix.conf         # nginx configuration file used by above Dockerfile
+└── requirements.txt   # python dependencies
 ```
 
 ### How the data looks like
@@ -142,9 +139,11 @@ Details about the formatting are given at the head of the file.
 
 The source data (i.e. all files located in `sources/` that are not images) is made available under the Open Database License: <https://opendatacommons.org/licenses/odbl/1.0/>. Any rights in individual contents of the database are licensed under the Database Contents License: <https://opendatacommons.org/licenses/dbcl/1.0/>.
 
-The images in `sources/img/` are subject to their own licensing terms, which are stated in the file `sources/img/img-sources.yaml`.
+> [!WARNING]
+> The images in `sources/img/` are subject to their own licensing terms, which are stated in the file `sources/img/img-sources.yaml`.
 
-_Please note that the compiled database may contain contents from external sources (i.e. all files in `external`) that do have different license terms._
+> [!WARNING]
+> The compiled database may contain contents from external sources (i.e. all files in `external/`) that do have different license terms.
 
 ---
 
diff --git a/data/external/main.py b/data/external/main.py
deleted file mode 100644
index ca8803760..000000000
--- a/data/external/main.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import logging
-import os
-from pathlib import Path
-
-from external.scrapers import nat, public_transport, roomfinder, tumonline
-from external.scraping_utils import CACHE_PATH
-from utils import setup_logging
-
-
-def ensure_is_empty(directory: Path):
-    """
-    Make the specified directory empty by recursively deleting all its contents.
-
-    Args:
-    ----
-        directory: The directory path to be emptied.
-
-    Returns:
-    -------
-        None
-
-    """
-    for item in directory.iterdir():
-        if item.is_dir():
-            try:
-                os.removedirs(item)
-            except OSError:
-                ensure_is_empty(item)
-        else:
-            item.unlink()
-    directory.rmdir()
-    os.makedirs(CACHE_PATH, exist_ok=True)
-
-
-if __name__ == "__main__":
-    setup_logging(level=logging.INFO)
-    ensure_is_empty(CACHE_PATH)
-
-    print("::group::downloading TUMonline information")
-    tumonline.scrape_buildings()
-    tumonline.scrape_rooms()
-    tumonline.scrape_usages()
-    tumonline.scrape_orgs(lang="de")
-    tumonline.scrape_orgs(lang="en")
-    print("::endgroup::")
-
-    print("::group::downloading NAT information")
-    os.makedirs(CACHE_PATH / "nat", exist_ok=True)
-    nat.scrape_buildings()
-    nat.scrape_rooms()
-    print("::endgroup::")
-
-    print("::group::downloading public_transport information")
-    os.makedirs(CACHE_PATH / "public_transport", exist_ok=True)
-    public_transport.scrape_stations()
-    print("::endgroup::")
-
-    print("::group::downloading MyTUM information")
-    os.makedirs(CACHE_PATH / "maps" / "roomfinder", exist_ok=True)
-    os.makedirs(CACHE_PATH / "maps" / "roomfinder" / "kmz", exist_ok=True)
-    roomfinder.scrape_buildings()
-    roomfinder.scrape_rooms()
-    roomfinder.scrape_maps()
-    print("::endgroup::")
diff --git a/data/external/scrapers/nat.py b/data/external/scrapers/nat.py
index 02f0ef868..46d8b0741 100644
--- a/data/external/scrapers/nat.py
+++ b/data/external/scrapers/nat.py
@@ -10,7 +10,7 @@
 from tqdm.contrib.concurrent import thread_map
 
 from external.scraping_utils import _download_file, CACHE_PATH
-from utils import TranslatableStr as _
+from utils import TranslatableStr as _, setup_logging
 
 NAT_API_URL = "https://api.srv.nat.tum.de/api/v1/rom"
 NAT_CACHE_DIR = CACHE_PATH / "nat"
@@ -269,3 +269,10 @@ def _join_room_hits():
         with open(file_path, encoding="utf-8") as file:
             total_hits.extend(json.load(file)["hits"])
     return total_hits
+
+
+if __name__ == "__main__":
+    setup_logging(level=logging.INFO)
+    NAT_CACHE_DIR.mkdir(exist_ok=True)
+    scrape_buildings()
+    scrape_rooms()
diff --git a/data/external/scrapers/public_transport.py b/data/external/scrapers/public_transport.py
index 965699876..8d87199cf 100644
--- a/data/external/scrapers/public_transport.py
+++ b/data/external/scrapers/public_transport.py
@@ -4,6 +4,7 @@
 from zipfile import ZipFile
 
 from external.scraping_utils import _download_file, CACHE_PATH
+from utils import setup_logging
 
 MVV_OPENDATA_URL = "https://www.mvv-muenchen.de/fileadmin/mediapool/02-Fahrplanauskunft/03-Downloads/openData"
 MVV_GTFS_URL = f"{MVV_OPENDATA_URL}/mvv_gtfs.zip"
@@ -118,3 +119,9 @@ def scrape_stations() -> None:
     stations = sorted(stations.values(), key=lambda x: x["lat"])
     with open(CACHE_PATH / "public_transport.json", "w", encoding="utf-8") as file:
         json.dump(stations, file, indent=2, sort_keys=True)
+
+
+if __name__ == "__main__":
+    setup_logging(level=logging.INFO)
+    PUBLIC_TRANSPORT_CACHE_PATH.mkdir(exist_ok=True)
+    scrape_stations()
diff --git a/data/external/scrapers/roomfinder.py b/data/external/scrapers/roomfinder.py
index a5c5b4af2..998fe1eed 100644
--- a/data/external/scrapers/roomfinder.py
+++ b/data/external/scrapers/roomfinder.py
@@ -1,6 +1,7 @@
 import itertools
 import json
 import logging
+import os
 import string
 import urllib.parse
 import xmlrpc.client  # nosec: B411
@@ -15,7 +16,7 @@
 from tqdm import tqdm
 
 from external.scraping_utils import _download_file, CACHE_PATH, maybe_sleep
-from utils import convert_to_webp
+from utils import convert_to_webp, setup_logging
 
 ROOMFINDER_API_URL = "http://roomfinder.ze.tum.de:8192"
 
@@ -257,3 +258,12 @@ def _download_map(_map_id: str, e_id: str, e_type: Literal["room", "building"])
     except requests.exceptions.RequestException:
         return None
     raise RuntimeError(f"Unknown entity type: {e_type}")
+
+
+if __name__ == "__main__":
+    setup_logging(level=logging.INFO)
+    os.makedirs(CACHE_PATH / "maps" / "roomfinder", exist_ok=True)
+    os.makedirs(CACHE_PATH / "maps" / "roomfinder" / "kmz", exist_ok=True)
+    scrape_buildings()
+    scrape_rooms()
+    scrape_maps()
diff --git a/data/external/scrapers/tumonline.py b/data/external/scrapers/tumonline.py
index f38d44a35..e4fe732b7 100644
--- a/data/external/scrapers/tumonline.py
+++ b/data/external/scrapers/tumonline.py
@@ -9,6 +9,7 @@
 from requests_oauthlib import OAuth2Session
 
 from external.scraping_utils import CACHE_PATH
+from utils import setup_logging
 
 TUMONLINE_URL = "https://campus.tum.de/tumonline"
 CONNECTUM_URL = f"{TUMONLINE_URL}/co/connectum"
@@ -130,3 +131,12 @@ def scrape_orgs(lang: typing.Literal["de", "en"]) -> None:
 
     with open(CACHE_PATH / f"orgs-{lang}_tumonline.json", "w", encoding="utf-8") as file:
         json.dump(orgs, file, indent=2, sort_keys=True)
+
+
+if __name__ == "__main__":
+    setup_logging(level=logging.INFO)
+    scrape_buildings()
+    scrape_rooms()
+    scrape_usages()
+    scrape_orgs(lang="de")
+    scrape_orgs(lang="en")
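The deleted `external/main.py` used to orchestrate all scrapers in one process; after this change each module carries its own `if __name__ == "__main__":` entry point, and the workflow invokes them one by one. If an all-in-one local run is still wanted, something along the following lines should work. This is a minimal sketch, not part of this PR: the file name `run_all_scrapers.py` is hypothetical, and it assumes the layout shown above (`external.scrapers.*`, `external.scraping_utils.CACHE_PATH`, `utils.setup_logging`), that it is started from `data/external` with `PYTHONPATH=$PYTHONPATH:..` as documented in the README, and that the CONNECTUM OAuth credentials are exported for the TUMonline part.

```python
# run_all_scrapers.py -- hypothetical local helper, not part of this PR.
# Re-creates what the removed external/main.py did, but on top of the new
# per-module entry points. Run from data/external with PYTHONPATH=$PYTHONPATH:..
import logging

from external.scrapers import nat, public_transport, roomfinder, tumonline
from external.scraping_utils import CACHE_PATH
from utils import setup_logging

if __name__ == "__main__":
    setup_logging(level=logging.INFO)

    # TUMonline: needs CONNECTUM_OAUTH_CLIENT_ID / CONNECTUM_OAUTH_CLIENT_SECRET set.
    tumonline.scrape_buildings()
    tumonline.scrape_rooms()
    tumonline.scrape_usages()
    tumonline.scrape_orgs(lang="de")
    tumonline.scrape_orgs(lang="en")

    # NAT: same cache directory the module's own __main__ block creates.
    (CACHE_PATH / "nat").mkdir(parents=True, exist_ok=True)
    nat.scrape_buildings()
    nat.scrape_rooms()

    # MVV public transport stations.
    (CACHE_PATH / "public_transport").mkdir(parents=True, exist_ok=True)
    public_transport.scrape_stations()

    # MyTUM roomfinder, including the map/kmz cache directories.
    (CACHE_PATH / "maps" / "roomfinder" / "kmz").mkdir(parents=True, exist_ok=True)
    roomfinder.scrape_buildings()
    roomfinder.scrape_rooms()
    roomfinder.scrape_maps()
```

Invoked the same way the README now documents for the individual scrapers, e.g. `cd data/external && PYTHONPATH=$PYTHONPATH:.. python3 run_all_scrapers.py`.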