-
-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implemented better backoff for the scrapers of the data pipeline
- Loading branch information
1 parent
530c393
commit 369d131
Showing
6 changed files
with
64 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,28 +1,26 @@ | ||
import logging | ||
import time | ||
import urllib.request | ||
from pathlib import Path | ||
from urllib.error import HTTPError | ||
|
||
import backoff | ||
import requests | ||
|
||
CACHE_PATH = Path(__file__).parent / "results" | ||
|
||
|
||
def maybe_sleep(duration: float) -> None: | ||
"""Sleep for the given duration, but only if the script was called during a workday and working hours.""" | ||
if time.gmtime().tm_wday not in [5, 6] and 5 <= time.gmtime().tm_hour <= 22: | ||
if time.gmtime().tm_wday not in [5, 6] and 7 <= time.gmtime().tm_hour <= 20: | ||
time.sleep(duration) | ||
|
||
|
||
def _download_file(url: str, target_cache_file: Path, quiet: bool = False, quiet_errors: bool = False) -> Path | None: | ||
if not target_cache_file.exists(): | ||
# url parameter does not allow path traversal, because we build it further up in the callstack | ||
try: | ||
urllib.request.urlretrieve(url, target_cache_file) # nosec: B310 | ||
except HTTPError as error: | ||
if not quiet_errors: | ||
logging.warning(f"GET {url} -> Failed to retrieve because: {error}") | ||
return None | ||
if not quiet: | ||
logging.info(f"GET {url}") | ||
|
||
@backoff.on_exception(backoff.expo, requests.exceptions.RequestException) | ||
def _download_file(url: str, target_cache_file: Path) -> Path | None: | ||
if target_cache_file.exists(): | ||
target_cache_file.unlink() | ||
# url parameter does not allow path traversal, because we build it further up in the callstack | ||
with requests.get(url, stream=True, timeout=10) as r: | ||
r.raise_for_status() | ||
with open(target_cache_file, "wb") as f: | ||
for chunk in r.iter_content(chunk_size=8192): | ||
f.write(chunk) | ||
return target_cache_file |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
backoff~=2.2.1 | ||
beautifulsoup4~=4.12.2 | ||
defusedxml~=0.7.1 | ||
lxml~=5.2.0 | ||
|