diff --git a/data/external/scrapers/nat.py b/data/external/scrapers/nat.py
index aecbb03af..02f0ef868 100644
--- a/data/external/scrapers/nat.py
+++ b/data/external/scrapers/nat.py
@@ -4,16 +4,19 @@
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
 
+import backoff
 import requests
-from external.scraping_utils import _download_file, CACHE_PATH
 from tqdm import tqdm
 from tqdm.contrib.concurrent import thread_map
+
+from external.scraping_utils import _download_file, CACHE_PATH
 from utils import TranslatableStr as _
 
 NAT_API_URL = "https://api.srv.nat.tum.de/api/v1/rom"
 NAT_CACHE_DIR = CACHE_PATH / "nat"
 
 
+@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
 def scrape_buildings():
     """Retrieve the buildings as in the NAT roomfinder."""
     logging.info("Scraping the buildings of the NAT")
@@ -183,8 +186,9 @@ def _download_and_merge_room(base):
     """Download the room information and merge it with the base information."""
     room_code = base["room_code"]
     target_filepath = NAT_CACHE_DIR / f"room_{room_code}.json"
-    downloaded_file = _download_file(f"{NAT_API_URL}/{room_code}", target_filepath, quiet=True)
-    if not downloaded_file:
+    try:
+        downloaded_file = _download_file(f"{NAT_API_URL}/{room_code}", target_filepath)
+    except requests.exceptions.RequestException:
         return None
     content = json.loads(downloaded_file.read_text(encoding="utf-8"))
     for useless_key in ["events_end", "events_start"]:
@@ -230,13 +234,12 @@ def _get_base_room_infos():
 
 
 def _try_download_room_base_info(start: int, batch: int) -> tuple[tuple[int, int], Path | None]:
-    downloaded_file = _download_file(
-        f"{NAT_API_URL}/?limit={batch}&offset={start}",
-        NAT_CACHE_DIR / f"rooms_base_{start}_to_{start + batch - 1 }.json",
-        quiet=True,
-        quiet_errors=True,
-    )
-    return (start, batch), downloaded_file
+    try:
+        url = f"{NAT_API_URL}/?limit={batch}&offset={start}"
+        file_path = NAT_CACHE_DIR / f"rooms_base_{start}_to_{start + batch - 1}.json"
+        return (start, batch), _download_file(url, file_path)
+    except requests.exceptions.RequestException:
+        return (start, batch), None
 
 
 def _report_undownloadable(undownloadable: list[int]) -> None:
diff --git a/data/external/scrapers/roomfinder.py b/data/external/scrapers/roomfinder.py
index eacc0e44f..728c3ec3c 100644
--- a/data/external/scrapers/roomfinder.py
+++ b/data/external/scrapers/roomfinder.py
@@ -9,6 +9,7 @@
 from pathlib import Path
 from typing import Literal, TypedDict
 
+import requests
 import utm
 from defusedxml import ElementTree as ET
 from tqdm import tqdm
@@ -203,7 +204,7 @@ def _download_maps(used_maps):
         # Download as file
         url = f"{ROOMFINDER_API_URL}/getMapImage?m_id={_map[1].removeprefix('rf')}"
         filepath = CACHE_PATH / "maps" / "roomfinder" / f"{_map[1]}.gif"
-        _download_file(url, filepath, quiet=True)
+        _download_file(url, filepath)
         convert_to_webp(filepath)
 
         map_data = {
@@ -244,9 +245,15 @@ def _download_map(_map_id: str, e_id: str, e_type: Literal["room", "building"])
     if e_type == "room":
         base_url = "https://portal.mytum.de/campus/roomfinder/getRoomPlacemark"
         url = f"{base_url}?roomid={urllib.parse.quote_plus(e_id)}&mapid={_map_id.removeprefix('rf')}"
-        return _download_file(url, filepath, quiet=True)
+        try:
+            return _download_file(url, filepath)
+        except requests.exceptions.RequestException:
+            return None
     if e_type == "building":
         base_url = "https://portal.mytum.de/campus/roomfinder/getBuildingPlacemark"
         url = f"{base_url}?b_id={e_id}&mapid={_map_id.removeprefix('rf')}"
-        return _download_file(url, filepath, quiet=True)
+        try:
+            return _download_file(url, filepath)
+        except requests.exceptions.RequestException:
+            return None
     raise RuntimeError(f"Unknown entity type: {e_type}")
diff --git a/data/external/scrapers/tumonline.py b/data/external/scrapers/tumonline.py
index 7b6b5d5c5..cf63e1ac1 100644
--- a/data/external/scrapers/tumonline.py
+++ b/data/external/scrapers/tumonline.py
@@ -8,6 +8,7 @@
 import typing
 from pathlib import Path
 
+import backoff
 import requests
 from bs4 import BeautifulSoup, element
 from defusedxml import ElementTree as ET
@@ -193,6 +194,7 @@ def scrape_usages() -> None:
         json.dump(usages, file, indent=2, sort_keys=True)
 
 
+@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
 def scrape_orgs(lang: typing.Literal["de", "en"]) -> None:
     """
     Retrieve all organisations in TUMonline, that may operate rooms.
@@ -261,6 +263,12 @@ def merge(self, other: "ParsedRoomsList") -> "ParsedRoomsList":
     )
 
 
+@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
+def _tumonline_roomsearch(search_params) -> ParsedRoomsList:
+    req = requests.post(f"{TUMONLINE_URL}/wbSuche.raumSuche", data=search_params, timeout=30)
+    return _parse_rooms_list(BeautifulSoup(req.text, "lxml"))
+
+
 @functools.cache
 def _retrieve_roomlist(f_type: str, f_name: str, f_value: int, area_id: int = 0) -> list[ParsedRoom]:
     """Retrieve all rooms from the TUMonline room search list (multipage)"""
@@ -276,8 +284,7 @@ def _retrieve_roomlist(f_type: str, f_name: str, f_value: int, area_id: int = 0)
             "pVerwalter": 1,
             f_name: f_value,
         }
-        req = requests.post(f"{TUMONLINE_URL}/wbSuche.raumSuche", data=search_params, timeout=30)
-        rooms_list = _parse_rooms_list(BeautifulSoup(req.text, "lxml"))
+        rooms_list = _tumonline_roomsearch(search_params)
         scraped_rooms = scraped_rooms.merge(rooms_list)
 
         maybe_sleep(1.5)
@@ -412,6 +419,7 @@ def _get_roomsearch_xml(url: str, params: dict[str, str | int], cache_fname: str
     return BeautifulSoup(elem.text, "lxml")
 
 
+@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
 def _get_xml(url: str, params: dict[str, str | int], cache_fname: str) -> ET:
     cache_path = CACHE_PATH / cache_fname
     if cache_path.exists():
@@ -425,6 +433,7 @@ def _get_xml(url: str, params: dict[str, str | int], cache_fname: str) -> ET:
     return ET.fromstring(req.text)
 
 
+@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
 def _get_html(url: str, cached_xml_file: Path) -> BeautifulSoup:
     if cached_xml_file.exists():
         with open(cached_xml_file, encoding="utf-8") as file:
diff --git a/data/external/scraping_utils.py b/data/external/scraping_utils.py
index f115d0105..9bb367350 100644
--- a/data/external/scraping_utils.py
+++ b/data/external/scraping_utils.py
@@ -1,28 +1,26 @@
-import logging
 import time
-import urllib.request
 from pathlib import Path
-from urllib.error import HTTPError
+
+import backoff
+import requests
 
 CACHE_PATH = Path(__file__).parent / "results"
 
 
 def maybe_sleep(duration: float) -> None:
     """Sleep for the given duration, but only if the script was called during a workday and working hours."""
-    if time.gmtime().tm_wday not in [5, 6] and 5 <= time.gmtime().tm_hour <= 22:
+    if time.gmtime().tm_wday not in [5, 6] and 7 <= time.gmtime().tm_hour <= 20:
         time.sleep(duration)
 
 
-def _download_file(url: str, target_cache_file: Path, quiet: bool = False, quiet_errors: bool = False) -> Path | None:
-    if not target_cache_file.exists():
-        # url parameter does not allow path traversal, because we build it further up in the callstack
-        try:
-            urllib.request.urlretrieve(url, target_cache_file)  # nosec: B310
-        except HTTPError as error:
-            if not quiet_errors:
-                logging.warning(f"GET {url} -> Failed to retrieve because: {error}")
-            return None
-        if not quiet:
-            logging.info(f"GET {url}")
-
+@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
+def _download_file(url: str, target_cache_file: Path) -> Path | None:
+    if target_cache_file.exists():
+        target_cache_file.unlink()
+    # url parameter does not allow path traversal, because we build it further up in the callstack
+    with requests.get(url, stream=True, timeout=10) as r:
+        r.raise_for_status()
+        with open(target_cache_file, "wb") as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
     return target_cache_file
diff --git a/data/processors/sitemap.py b/data/processors/sitemap.py
index 7b24b3f46..a8b8d6e54 100644
--- a/data/processors/sitemap.py
+++ b/data/processors/sitemap.py
@@ -5,8 +5,10 @@
 from pathlib import Path
 from typing import Literal, TypedDict
 
+import backoff
 import requests
 from defusedxml import ElementTree as defusedET
+
 from utils import DEBUG_MODE
 
 OLD_DATA_URL = "https://nav.tum.de/cdn/api_data.json"
@@ -48,7 +50,11 @@ def generate_sitemap() -> None:
     # sitemaps name. In case there aren't, we assume this sitemap is new,
     # and all entries will be marked as changed
     old_sitemaps = _download_online_sitemaps()
-    old_data = _download_old_data()
+    try:
+        old_data = _download_old_data()
+    except requests.exceptions.RequestException as error:
+        logging.warning(f"Could not download online data because of {error}. Assuming all entries are new.")
+        old_data = []
 
     sitemaps: Sitemaps = _extract_sitemap_data(new_data, old_data, old_sitemaps)
 
@@ -58,20 +64,17 @@ def generate_sitemap() -> None:
     _write_sitemapindex_xml(OUTPUT_DIR / "sitemap.xml", sitemaps)
 
 
+@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
 def _download_old_data() -> list:
     """Download the currently online data from the server"""
-    try:
-        req = requests.get(OLD_DATA_URL, headers={"Accept-Encoding": "gzip"}, timeout=120)
-        if req.status_code != 200:
-            logging.warning(f"Could not download online data because of {req.status_code=}. Assuming all are new")
-            return []
-        old_data = req.json()
-        if isinstance(old_data, dict):
-            old_data = list(old_data.values())
-        return old_data
-    except requests.exceptions.RequestException as error:
-        logging.warning(f"Could not download online data because of {error}. Assuming all entries are new.")
+    req = requests.get(OLD_DATA_URL, headers={"Accept-Encoding": "gzip"}, timeout=120)
+    if req.status_code != 200:
+        logging.warning(f"Could not download online data because of {req.status_code=}. Assuming all are new")
         return []
+    old_data = req.json()
+    if isinstance(old_data, dict):
+        old_data = list(old_data.values())
+    return old_data
 
 
 def _extract_sitemap_data(new_data: list, old_data: list, old_sitemaps: SimplifiedSitemaps) -> Sitemaps:
diff --git a/data/requirements.txt b/data/requirements.txt
index f1cb20d1b..e553ab6b3 100644
--- a/data/requirements.txt
+++ b/data/requirements.txt
@@ -1,10 +1,11 @@
+backoff~=2.2.1
 beautifulsoup4~=4.12.2
 defusedxml~=0.7.1
 lxml~=5.2.0
 numba~=0.59.0rc1
 Pillow~=10.3.0
 pydantic~=2.7.0
-pytest~=8.1.1
+pytest~=8.2.0
 pyyaml~=6.0
 requests~=2.31.0
 ruamel.yaml~=0.18.5
diff --git a/webclient/.eslintrc.cjs b/webclient/.eslintrc.cjs
deleted file mode 100644
index 128333065..000000000
--- a/webclient/.eslintrc.cjs
+++ /dev/null
@@ -1,71 +0,0 @@
-/* eslint-env node */
-module.exports = {
-  root: true,
-  extends: [
-    "plugin:vue/vue3-essential",
-    "plugin:vue/vue3-strongly-recommended",
-    "plugin:vue/vue3-recommended",
-    "eslint:recommended",
-    "@vue/eslint-config-typescript/recommended",
-    "@vue/eslint-config-prettier",
-    "@vue/eslint-config-prettier/skip-formatting",
-  ],
-  parserOptions: {
-    ecmaVersion: "latest",
-  },
-  rules: {
-    "vue/no-v-html": "off",
-    "vue/block-lang": [
-      "error",
-      {
-        script: {
-          lang: "ts",
-        },
-      },
-    ],
-    "vue/block-order": [
-      "error",
-      {
-        order: ["script", "template", "style", "i18n"],
-      },
-    ],
-    "vue/block-tag-newline": "error",
-    "vue/component-api-style": [
-      "error",
-      ["script-setup", "composition"], // "script-setup", "composition", "composition-vue2", or "options"
-    ],
-    "vue/multi-word-component-names": "off",
-    "vue/component-name-in-template-casing": ["error", "PascalCase", { registeredComponentsOnly: false }],
-    "vue/custom-event-name-casing": ["error", "camelCase"],
-    "vue/define-macros-order": "error",
-    "vue/define-props-declaration": ["error", "type-based"],
-    "vue/html-button-has-type": [
-      "error",
-      {
-        button: true,
-        submit: true,
-        reset: true,
-      },
-    ],
-    "vue/no-boolean-default": ["error", "default-false"],
-    "vue/no-empty-component-block": "error",
-    "vue/html-comment-content-spacing": ["error", "always"],
-    "vue/no-ref-object-reactivity-loss": "error",
-    "vue/no-required-prop-with-default": "error",
-    "vue/no-restricted-call-after-await": "error",
-    //"vue/no-root-v-if": "error", todo: enable when there is a loading animation
-    "vue/no-setup-props-reactivity-loss": "error",
-    //"vue/no-static-inline-styles": "error", todo: enable after migration to tailwind
-    "vue/no-useless-mustaches": "error",
-    "vue/no-useless-v-bind": "error",
-    "vue/no-v-text": "error",
-    "vue/padding-line-between-blocks": "error",
-    "vue/prefer-prop-type-boolean-first": "error",
-    "vue/prefer-separate-static-class": "error",
-    "vue/require-macro-variable-name": "error",
-    "vue/require-typed-ref": "error",
-    "vue/static-class-names-order": "off",
-    "vue/v-for-delimiter-style": "error",
-    "vue/no-constant-condition": "error",
-  },
-};
diff --git a/webclient/.gitignore b/webclient/.gitignore
index f01bc4f86..81d3f73cc 100644
--- a/webclient/.gitignore
+++ b/webclient/.gitignore
@@ -24,8 +24,3 @@ coverage
 *.njsproj
 *.sln
 *.sw?
-/cdn/
-
-# lockfiles
-pnpm-lock.yaml
-package-lock.json
diff --git a/webclient/components/AppSearchBar.vue b/webclient/components/AppSearchBar.vue
index 298cc156a..663d2d9a3 100644
--- a/webclient/components/AppSearchBar.vue
+++ b/webclient/components/AppSearchBar.vue
@@ -170,7 +170,7 @@ watchEffect(() => {