implemented better backoff for the scrapers of the data pipeline
CommanderStorm committed Apr 28, 2024
1 parent 530c393 commit 369d131
Showing 6 changed files with 64 additions and 43 deletions.
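
The recurring change across these files is the backoff decorator: functions that talk to external services are wrapped in backoff.on_exception(backoff.expo, requests.exceptions.RequestException), which re-runs the call with exponentially growing waits whenever requests raises. A minimal sketch of the pattern (the function name and URL are illustrative, not part of the repository):

import backoff
import requests


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def fetch_json(url: str) -> dict:
    """Retry with exponential backoff on connection errors, timeouts and HTTP error codes."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # 4xx/5xx raise HTTPError, a RequestException subclass, so they also trigger a retry
    return response.json()

As far as backoff's defaults go, omitting max_tries and max_time means the decorator keeps retrying until the call succeeds; the decorated scrapers below rely on that default.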
23 changes: 13 additions & 10 deletions data/external/scrapers/nat.py
@@ -4,16 +4,19 @@
from multiprocessing.pool import ThreadPool
from pathlib import Path

import backoff
import requests
from external.scraping_utils import _download_file, CACHE_PATH
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map

from external.scraping_utils import _download_file, CACHE_PATH
from utils import TranslatableStr as _

NAT_API_URL = "https://api.srv.nat.tum.de/api/v1/rom"
NAT_CACHE_DIR = CACHE_PATH / "nat"


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def scrape_buildings():
"""Retrieve the buildings as in the NAT roomfinder."""
logging.info("Scraping the buildings of the NAT")
@@ -183,8 +186,9 @@ def _download_and_merge_room(base):
"""Download the room information and merge it with the base information."""
room_code = base["room_code"]
target_filepath = NAT_CACHE_DIR / f"room_{room_code}.json"
downloaded_file = _download_file(f"{NAT_API_URL}/{room_code}", target_filepath, quiet=True)
if not downloaded_file:
try:
downloaded_file = _download_file(f"{NAT_API_URL}/{room_code}", target_filepath)
except requests.exceptions.RequestException:
return None
content = json.loads(downloaded_file.read_text(encoding="utf-8"))
for useless_key in ["events_end", "events_start"]:
@@ -230,13 +234,12 @@ def _get_base_room_infos():


def _try_download_room_base_info(start: int, batch: int) -> tuple[tuple[int, int], Path | None]:
downloaded_file = _download_file(
f"{NAT_API_URL}/?limit={batch}&offset={start}",
NAT_CACHE_DIR / f"rooms_base_{start}_to_{start + batch - 1 }.json",
quiet=True,
quiet_errors=True,
)
return (start, batch), downloaded_file
try:
url = f"{NAT_API_URL}/?limit={batch}&offset={start}"
file_path = NAT_CACHE_DIR / f"rooms_base_{start}_to_{start + batch - 1}.json"
return (start, batch), _download_file(url, file_path)
except requests.exceptions.RequestException:
return (start, batch), None


def _report_undownloadable(undownloadable: list[int]) -> None:
13 changes: 10 additions & 3 deletions data/external/scrapers/roomfinder.py
@@ -9,6 +9,7 @@
from pathlib import Path
from typing import Literal, TypedDict

import requests
import utm
from defusedxml import ElementTree as ET
from tqdm import tqdm
@@ -203,7 +204,7 @@ def _download_maps(used_maps):
# Download as file
url = f"{ROOMFINDER_API_URL}/getMapImage?m_id={_map[1].removeprefix('rf')}"
filepath = CACHE_PATH / "maps" / "roomfinder" / f"{_map[1]}.gif"
_download_file(url, filepath, quiet=True)
_download_file(url, filepath)
convert_to_webp(filepath)

map_data = {
@@ -244,9 +245,15 @@ def _download_map(_map_id: str, e_id: str, e_type: Literal["room", "building"])
if e_type == "room":
base_url = "https://portal.mytum.de/campus/roomfinder/getRoomPlacemark"
url = f"{base_url}?roomid={urllib.parse.quote_plus(e_id)}&mapid={_map_id.removeprefix('rf')}"
return _download_file(url, filepath, quiet=True)
try:
return _download_file(url, filepath)
except requests.exceptions.RequestException:
return None
if e_type == "building":
base_url = "https://portal.mytum.de/campus/roomfinder/getBuildingPlacemark"
url = f"{base_url}?b_id={e_id}&mapid={_map_id.removeprefix('rf')}"
return _download_file(url, filepath, quiet=True)
try:
return _download_file(url, filepath)
except requests.exceptions.RequestException:
return None
raise RuntimeError(f"Unknown entity type: {e_type}")
13 changes: 11 additions & 2 deletions data/external/scrapers/tumonline.py
Expand Up @@ -8,6 +8,7 @@
import typing
from pathlib import Path

import backoff
import requests
from bs4 import BeautifulSoup, element
from defusedxml import ElementTree as ET
@@ -193,6 +194,7 @@ def scrape_usages() -> None:
json.dump(usages, file, indent=2, sort_keys=True)


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def scrape_orgs(lang: typing.Literal["de", "en"]) -> None:
"""
Retrieve all organisations in TUMonline, that may operate rooms.
@@ -261,6 +263,12 @@ def merge(self, other: "ParsedRoomsList") -> "ParsedRoomsList":
)


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def _tumonline_roomsearch(search_params) -> ParsedRoomsList:
req = requests.post(f"{TUMONLINE_URL}/wbSuche.raumSuche", data=search_params, timeout=30)
return _parse_rooms_list(BeautifulSoup(req.text, "lxml"))


@functools.cache
def _retrieve_roomlist(f_type: str, f_name: str, f_value: int, area_id: int = 0) -> list[ParsedRoom]:
"""Retrieve all rooms from the TUMonline room search list (multipage)"""
@@ -276,8 +284,7 @@ def _retrieve_roomlist(f_type: str, f_name: str, f_value: int, area_id: int = 0)
"pVerwalter": 1,
f_name: f_value,
}
req = requests.post(f"{TUMONLINE_URL}/wbSuche.raumSuche", data=search_params, timeout=30)
rooms_list = _parse_rooms_list(BeautifulSoup(req.text, "lxml"))
rooms_list = _tumonline_roomsearch(search_params)
scraped_rooms = scraped_rooms.merge(rooms_list)

maybe_sleep(1.5)
@@ -412,6 +419,7 @@ def _get_roomsearch_xml(url: str, params: dict[str, str | int], cache_fname: str
return BeautifulSoup(elem.text, "lxml")


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def _get_xml(url: str, params: dict[str, str | int], cache_fname: str) -> ET:
cache_path = CACHE_PATH / cache_fname
if cache_path.exists():
@@ -425,6 +433,7 @@ def _get_xml(url: str, params: dict[str, str | int], cache_fname: str) -> ET:
return ET.fromstring(req.text)


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def _get_html(url: str, cached_xml_file: Path) -> BeautifulSoup:
if cached_xml_file.exists():
with open(cached_xml_file, encoding="utf-8") as file:
30 changes: 14 additions & 16 deletions data/external/scraping_utils.py
@@ -1,28 +1,26 @@
import logging
import time
import urllib.request
from pathlib import Path
from urllib.error import HTTPError

import backoff
import requests

CACHE_PATH = Path(__file__).parent / "results"


def maybe_sleep(duration: float) -> None:
"""Sleep for the given duration, but only if the script was called during a workday and working hours."""
if time.gmtime().tm_wday not in [5, 6] and 5 <= time.gmtime().tm_hour <= 22:
if time.gmtime().tm_wday not in [5, 6] and 7 <= time.gmtime().tm_hour <= 20:
time.sleep(duration)


def _download_file(url: str, target_cache_file: Path, quiet: bool = False, quiet_errors: bool = False) -> Path | None:
if not target_cache_file.exists():
# url parameter does not allow path traversal, because we build it further up in the callstack
try:
urllib.request.urlretrieve(url, target_cache_file) # nosec: B310
except HTTPError as error:
if not quiet_errors:
logging.warning(f"GET {url} -> Failed to retrieve because: {error}")
return None
if not quiet:
logging.info(f"GET {url}")

@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def _download_file(url: str, target_cache_file: Path) -> Path | None:
if target_cache_file.exists():
target_cache_file.unlink()
# url parameter does not allow path traversal, because we build it further up in the callstack
with requests.get(url, stream=True, timeout=10) as r:
r.raise_for_status()
with open(target_cache_file, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
return target_cache_file
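
For illustration, a caller-side sketch of how the rewritten helper is consumed (the function name, room code, and cache filename below are made-up examples): _download_file itself retries transient failures through the decorator, and callers such as nat.py above that prefer to skip an item rather than abort the scrape additionally wrap the call in try/except.

from pathlib import Path

import requests

from external.scraping_utils import CACHE_PATH, _download_file


def download_room_or_skip(room_code: str) -> Path | None:
    """Fetch one room JSON into the cache; treat a failing request as missing data."""
    target = CACHE_PATH / "nat" / f"room_{room_code}.json"
    try:
        return _download_file(f"https://api.srv.nat.tum.de/api/v1/rom/{room_code}", target)
    except requests.exceptions.RequestException:
        return None

Streaming the response in 8192-byte chunks keeps memory use flat for larger files, and raise_for_status turns HTTP error responses into the exceptions that the backoff decorator retries on.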
27 changes: 15 additions & 12 deletions data/processors/sitemap.py
@@ -5,8 +5,10 @@
from pathlib import Path
from typing import Literal, TypedDict

import backoff
import requests
from defusedxml import ElementTree as defusedET

from utils import DEBUG_MODE

OLD_DATA_URL = "https://nav.tum.de/cdn/api_data.json"
@@ -48,7 +50,11 @@ def generate_sitemap() -> None:
# sitemaps name. In case there aren't, we assume this sitemap is new,
# and all entries will be marked as changed
old_sitemaps = _download_online_sitemaps()
old_data = _download_old_data()
try:
old_data = _download_old_data()
except requests.exceptions.RequestException as error:
logging.warning(f"Could not download online data because of {error}. Assuming all entries are new.")
old_data = []

sitemaps: Sitemaps = _extract_sitemap_data(new_data, old_data, old_sitemaps)

@@ -58,20 +64,17 @@ def generate_sitemap() -> None:
_write_sitemapindex_xml(OUTPUT_DIR / "sitemap.xml", sitemaps)


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def _download_old_data() -> list:
"""Download the currently online data from the server"""
try:
req = requests.get(OLD_DATA_URL, headers={"Accept-Encoding": "gzip"}, timeout=120)
if req.status_code != 200:
logging.warning(f"Could not download online data because of {req.status_code=}. Assuming all are new")
return []
old_data = req.json()
if isinstance(old_data, dict):
old_data = list(old_data.values())
return old_data
except requests.exceptions.RequestException as error:
logging.warning(f"Could not download online data because of {error}. Assuming all entries are new.")
req = requests.get(OLD_DATA_URL, headers={"Accept-Encoding": "gzip"}, timeout=120)
if req.status_code != 200:
logging.warning(f"Could not download online data because of {req.status_code=}. Assuming all are new")
return []
old_data = req.json()
if isinstance(old_data, dict):
old_data = list(old_data.values())
return old_data


def _extract_sitemap_data(new_data: list, old_data: list, old_sitemaps: SimplifiedSitemaps) -> Sitemaps:
Expand Down
1 change: 1 addition & 0 deletions data/requirements.txt
@@ -1,3 +1,4 @@
backoff~=2.2.1
beautifulsoup4~=4.12.2
defusedxml~=0.7.1
lxml~=5.2.0
