implemented better backoff for the scrapers of the data pipeline
CommanderStorm committed Apr 28, 2024
1 parent 530c393 commit 369d131
Showing 6 changed files with 64 additions and 43 deletions.
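
The recurring change across these files is the backoff decorator: functions that talk to external services are wrapped in backoff.on_exception(backoff.expo, requests.exceptions.RequestException), which re-runs the call with exponentially growing waits whenever requests raises. A minimal sketch of the pattern (the function name and URL are illustrative, not part of the repository):

import backoff
import requests


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def fetch_json(url: str) -> dict:
    """Retry with exponential backoff on connection errors, timeouts and HTTP error codes."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # 4xx/5xx raise HTTPError, a RequestException subclass, so they also trigger a retry
    return response.json()

As far as backoff's defaults go, omitting max_tries and max_time means the decorator keeps retrying until the call succeeds; the decorated scrapers below rely on that default.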
23 changes: 13 additions & 10 deletions data/external/scrapers/nat.py
@@ -4,16 +4,19 @@
from multiprocessing.pool import ThreadPool
from pathlib import Path

import backoff
import requests
from external.scraping_utils import _download_file, CACHE_PATH
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map

from external.scraping_utils import _download_file, CACHE_PATH
from utils import TranslatableStr as _

NAT_API_URL = "https://api.srv.nat.tum.de/api/v1/rom"
NAT_CACHE_DIR = CACHE_PATH / "nat"


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def scrape_buildings():
"""Retrieve the buildings as in the NAT roomfinder."""
logging.info("Scraping the buildings of the NAT")
@@ -183,8 +186,9 @@ def _download_and_merge_room(base):
"""Download the room information and merge it with the base information."""
room_code = base["room_code"]
target_filepath = NAT_CACHE_DIR / f"room_{room_code}.json"
downloaded_file = _download_file(f"{NAT_API_URL}/{room_code}", target_filepath, quiet=True)
if not downloaded_file:
try:
downloaded_file = _download_file(f"{NAT_API_URL}/{room_code}", target_filepath)
except requests.exceptions.RequestException:
return None
content = json.loads(downloaded_file.read_text(encoding="utf-8"))
for useless_key in ["events_end", "events_start"]:
@@ -230,13 +234,12 @@ def _get_base_room_infos():


def _try_download_room_base_info(start: int, batch: int) -> tuple[tuple[int, int], Path | None]:
downloaded_file = _download_file(
f"{NAT_API_URL}/?limit={batch}&offset={start}",
NAT_CACHE_DIR / f"rooms_base_{start}_to_{start + batch - 1 }.json",
quiet=True,
quiet_errors=True,
)
return (start, batch), downloaded_file
try:
url = f"{NAT_API_URL}/?limit={batch}&offset={start}"
file_path = NAT_CACHE_DIR / f"rooms_base_{start}_to_{start + batch - 1}.json"
return (start, batch), _download_file(url, file_path)
except requests.exceptions.RequestException:
return (start, batch), None


def _report_undownloadable(undownloadable: list[int]) -> None:
13 changes: 10 additions & 3 deletions data/external/scrapers/roomfinder.py
@@ -9,6 +9,7 @@
from pathlib import Path
from typing import Literal, TypedDict

import requests
import utm
from defusedxml import ElementTree as ET
from tqdm import tqdm
@@ -203,7 +204,7 @@ def _download_maps(used_maps):
# Download as file
url = f"{ROOMFINDER_API_URL}/getMapImage?m_id={_map[1].removeprefix('rf')}"
filepath = CACHE_PATH / "maps" / "roomfinder" / f"{_map[1]}.gif"
_download_file(url, filepath, quiet=True)
_download_file(url, filepath)
convert_to_webp(filepath)

map_data = {
@@ -244,9 +245,15 @@ def _download_map(_map_id: str, e_id: str, e_type: Literal["room", "building"])
if e_type == "room":
base_url = "https://portal.mytum.de/campus/roomfinder/getRoomPlacemark"
url = f"{base_url}?roomid={urllib.parse.quote_plus(e_id)}&mapid={_map_id.removeprefix('rf')}"
return _download_file(url, filepath, quiet=True)
try:
return _download_file(url, filepath)
except requests.exceptions.RequestException:
return None
if e_type == "building":
base_url = "https://portal.mytum.de/campus/roomfinder/getBuildingPlacemark"
url = f"{base_url}?b_id={e_id}&mapid={_map_id.removeprefix('rf')}"
return _download_file(url, filepath, quiet=True)
try:
return _download_file(url, filepath)
except requests.exceptions.RequestException:
return None
raise RuntimeError(f"Unknown entity type: {e_type}")
13 changes: 11 additions & 2 deletions data/external/scrapers/tumonline.py
Expand Up @@ -8,6 +8,7 @@
import typing
from pathlib import Path

import backoff
import requests
from bs4 import BeautifulSoup, element
from defusedxml import ElementTree as ET
@@ -193,6 +194,7 @@ def scrape_usages() -> None:
json.dump(usages, file, indent=2, sort_keys=True)


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def scrape_orgs(lang: typing.Literal["de", "en"]) -> None:
"""
Retrieve all organisations in TUMonline, that may operate rooms.
@@ -261,6 +263,12 @@ def merge(self, other: "ParsedRoomsList") -> "ParsedRoomsList":
)


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def _tumonline_roomsearch(search_params) -> ParsedRoomsList:
req = requests.post(f"{TUMONLINE_URL}/wbSuche.raumSuche", data=search_params, timeout=30)
return _parse_rooms_list(BeautifulSoup(req.text, "lxml"))


@functools.cache
def _retrieve_roomlist(f_type: str, f_name: str, f_value: int, area_id: int = 0) -> list[ParsedRoom]:
"""Retrieve all rooms from the TUMonline room search list (multipage)"""
@@ -276,8 +284,7 @@ def _retrieve_roomlist(f_type: str, f_name: str, f_value: int, area_id: int = 0)
"pVerwalter": 1,
f_name: f_value,
}
req = requests.post(f"{TUMONLINE_URL}/wbSuche.raumSuche", data=search_params, timeout=30)
rooms_list = _parse_rooms_list(BeautifulSoup(req.text, "lxml"))
rooms_list = _tumonline_roomsearch(search_params)
scraped_rooms = scraped_rooms.merge(rooms_list)

maybe_sleep(1.5)
@@ -412,6 +419,7 @@ def _get_roomsearch_xml(url: str, params: dict[str, str | int], cache_fname: str
return BeautifulSoup(elem.text, "lxml")


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def _get_xml(url: str, params: dict[str, str | int], cache_fname: str) -> ET:
cache_path = CACHE_PATH / cache_fname
if cache_path.exists():
@@ -425,6 +433,7 @@ def _get_xml(url: str, params: dict[str, str | int], cache_fname: str) -> ET:
return ET.fromstring(req.text)


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def _get_html(url: str, cached_xml_file: Path) -> BeautifulSoup:
if cached_xml_file.exists():
with open(cached_xml_file, encoding="utf-8") as file:
30 changes: 14 additions & 16 deletions data/external/scraping_utils.py
@@ -1,28 +1,26 @@
import logging
import time
import urllib.request
from pathlib import Path
from urllib.error import HTTPError

import backoff
import requests

CACHE_PATH = Path(__file__).parent / "results"


def maybe_sleep(duration: float) -> None:
"""Sleep for the given duration, but only if the script was called during a workday and working hours."""
if time.gmtime().tm_wday not in [5, 6] and 5 <= time.gmtime().tm_hour <= 22:
if time.gmtime().tm_wday not in [5, 6] and 7 <= time.gmtime().tm_hour <= 20:
time.sleep(duration)


def _download_file(url: str, target_cache_file: Path, quiet: bool = False, quiet_errors: bool = False) -> Path | None:
if not target_cache_file.exists():
# url parameter does not allow path traversal, because we build it further up in the callstack
try:
urllib.request.urlretrieve(url, target_cache_file) # nosec: B310
except HTTPError as error:
if not quiet_errors:
logging.warning(f"GET {url} -> Failed to retrieve because: {error}")
return None
if not quiet:
logging.info(f"GET {url}")

@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def _download_file(url: str, target_cache_file: Path) -> Path | None:
if target_cache_file.exists():
target_cache_file.unlink()
# url parameter does not allow path traversal, because we build it further up in the callstack
with requests.get(url, stream=True, timeout=10) as r:
r.raise_for_status()
with open(target_cache_file, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
return target_cache_file
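
For illustration, a caller-side sketch of how the rewritten helper is consumed (the function name, room code, and cache filename below are made-up examples): _download_file itself retries transient failures through the decorator, and callers such as nat.py above that prefer to skip an item rather than abort the scrape additionally wrap the call in try/except.

from pathlib import Path

import requests

from external.scraping_utils import CACHE_PATH, _download_file


def download_room_or_skip(room_code: str) -> Path | None:
    """Fetch one room JSON into the cache; treat a failing request as missing data."""
    target = CACHE_PATH / "nat" / f"room_{room_code}.json"
    try:
        return _download_file(f"https://api.srv.nat.tum.de/api/v1/rom/{room_code}", target)
    except requests.exceptions.RequestException:
        return None

Streaming the response in 8192-byte chunks keeps memory use flat for larger files, and raise_for_status turns HTTP error responses into the exceptions that the backoff decorator retries on.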
27 changes: 15 additions & 12 deletions data/processors/sitemap.py
@@ -5,8 +5,10 @@
from pathlib import Path
from typing import Literal, TypedDict

import backoff
import requests
from defusedxml import ElementTree as defusedET

from utils import DEBUG_MODE

OLD_DATA_URL = "https://nav.tum.de/cdn/api_data.json"
@@ -48,7 +50,11 @@ def generate_sitemap() -> None:
# sitemaps name. In case there aren't, we assume this sitemap is new,
# and all entries will be marked as changed
old_sitemaps = _download_online_sitemaps()
old_data = _download_old_data()
try:
old_data = _download_old_data()
except requests.exceptions.RequestException as error:
logging.warning(f"Could not download online data because of {error}. Assuming all entries are new.")
old_data = []

sitemaps: Sitemaps = _extract_sitemap_data(new_data, old_data, old_sitemaps)

@@ -58,20 +64,17 @@ def generate_sitemap() -> None:
_write_sitemapindex_xml(OUTPUT_DIR / "sitemap.xml", sitemaps)


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def _download_old_data() -> list:
"""Download the currently online data from the server"""
try:
req = requests.get(OLD_DATA_URL, headers={"Accept-Encoding": "gzip"}, timeout=120)
if req.status_code != 200:
logging.warning(f"Could not download online data because of {req.status_code=}. Assuming all are new")
return []
old_data = req.json()
if isinstance(old_data, dict):
old_data = list(old_data.values())
return old_data
except requests.exceptions.RequestException as error:
logging.warning(f"Could not download online data because of {error}. Assuming all entries are new.")
req = requests.get(OLD_DATA_URL, headers={"Accept-Encoding": "gzip"}, timeout=120)
if req.status_code != 200:
logging.warning(f"Could not download online data because of {req.status_code=}. Assuming all are new")
return []
old_data = req.json()
if isinstance(old_data, dict):
old_data = list(old_data.values())
return old_data


def _extract_sitemap_data(new_data: list, old_data: list, old_sitemaps: SimplifiedSitemaps) -> Sitemaps:
Expand Down
1 change: 1 addition & 0 deletions data/requirements.txt
@@ -1,3 +1,4 @@
backoff~=2.2.1
beautifulsoup4~=4.12.2
defusedxml~=0.7.1
lxml~=5.2.0
