diff --git a/docs/md/download.md b/docs/md/download.md index 069f0af..247dd5c 100644 --- a/docs/md/download.md +++ b/docs/md/download.md @@ -1,6 +1,6 @@ # Web utilities -Some utility functions are provided for common web requests. Most use the GitHub API to query information or download artifacts and assets. See this project's test cases (in particular `test_download.py`) for detailed usage examples. +Some utility functions are provided for GitHub-related web requests. See this project's test cases (in particular `test_download.py`) for detailed usage examples. **Note:** to avoid GitHub API rate limits when using these functions, it is recommended to set the `GITHUB_TOKEN` environment variable. If this variable is set, the token will be borne on requests sent to the API. @@ -18,56 +18,11 @@ assets = release["assets"] print([asset["name"] for asset in assets]) ``` -This yields `['code.json', 'linux.zip', 'mac.zip', 'win64.zip']`. - -Equivalently, using the `get_release_assets()` function to list the latest release assets directly: - -```python -from modflow_devtools.download import get_release_assets - -assets = get_release_assets("MODFLOW-USGS/executables") -print([asset["name"] for asset in assets]) -``` - -The `simple` parameter, defaulting to `False`, can be toggled to return a simple dictionary mapping asset names to download URLs: - -```python -from pprint import pprint - -assets = get_release_assets("MODFLOW-USGS/executables", simple=True) -pprint(assets) -``` - -This prints: - -``` -{'code.json': 'https://github.com/MODFLOW-USGS/executables/releases/download/12.0/code.json', - 'linux.zip': 'https://github.com/MODFLOW-USGS/executables/releases/download/12.0/linux.zip', - 'mac.zip': 'https://github.com/MODFLOW-USGS/executables/releases/download/12.0/mac.zip', - 'win64.zip': 'https://github.com/MODFLOW-USGS/executables/releases/download/12.0/win64.zip'} -``` +This prints `['code.json', 'linux.zip', 'mac.zip', 'win64.zip']`. 
## Downloads -The `download_artifact` function downloads and unzips the GitHub Actions artifact with the given ID to the given path, optionally deleting the zipfile afterwards. The `repo` format is `owner/name`, as in GitHub URLs. For instance: - -```python -from modflow_devtools.download import list_artifacts, download_artifact - -repo = "MODFLOW-USGS/modflow6" -artifacts = list_artifacts(repo, max_pages=1, verbose=True) -artifact = next(iter(artifacts), None) -if artifact: - download_artifact( - repo=repo, - id=artifact["id"], - path=function_tmpdir, - delete_zip=False, - verbose=False, - ) -``` - -The `download_and_unzip` function is a more generic alternative for downloading and unzipping files from arbitrary URLs. +The `download_and_unzip` function downloads and unzips zip files. For instance, to download a MODFLOW 6.4.1 Linux distribution and delete the zipfile after extracting: diff --git a/modflow_devtools/download.py b/modflow_devtools/download.py index 3bdb5e8..c4dec7a 100644 --- a/modflow_devtools/download.py +++ b/modflow_devtools/download.py @@ -6,8 +6,7 @@ import urllib.request from os import PathLike from pathlib import Path -from typing import Optional, Union -from uuid import uuid4 +from typing import Optional from warnings import warn from modflow_devtools.zip import MFZipFile @@ -219,215 +218,6 @@ def get_latest_version(repo, retries=3, verbose=False) -> str: return release["tag_name"] -def get_release_assets( - repo, tag="latest", simple=False, retries=3, verbose=False -) -> Union[dict, list[dict]]: - """ - Get assets corresponding to the given release. 
- - Parameters - ---------- - repo : str - The repository (format must be owner/name) - tag : str - The release tag to retrieve assets for - simple : bool - If True, return a dict mapping asset names to download URLs, otherwise (by - default) a list of dicts containing asset info as returned by the GitHub API - retries : int - The maximum number of retries for each request - verbose : bool - Whether to show verbose output - - Returns - ------- - A list of dicts if simple is False, one per release asset. - If simple is True, a dict mapping asset names to download URLs. - """ - - if "/" not in repo: - raise ValueError("repo format must be owner/name") - - if not isinstance(tag, str) or not any(tag): - raise ValueError("tag must be a non-empty string") - - if not isinstance(retries, int) or retries < 1: - raise ValueError("retries must be a positive int") - - release = get_release(repo, tag=tag, retries=retries, verbose=verbose) - return ( - {a["name"]: a["browser_download_url"] for a in release["assets"]} - if simple - else release["assets"] - ) - - -def list_artifacts( - repo, name=None, per_page=30, max_pages=10, retries=3, verbose=False -) -> list[dict]: - """ - List artifacts for the given repository, optionally filtering by name (exact match). - If more artifacts are available than will fit within the given page size, by default - requests are made until all artifacts are retrieved. The number of requests made can - be limited with the max_pages parameter. - - Parameters - ---------- - repo : str - The repository (format must be owner/name) - name : str - The artifact name (must be an exact match) - per_page : int - The number of artifacts to return per page (must be between 1-100, inclusive) - max_pages : int - The maximum number of pages to retrieve (i.e. 
the number of requests to make) - retries : int - The maximum number of retries for each request - verbose : bool - Whether to show verbose output - - Returns - ------- - A list of dictionaries, each containing information - about an artifact as returned by the GitHub API. - """ - - if "/" not in repo: - raise ValueError("repo format must be owner/name") - - if not isinstance(retries, int) or retries < 1: - raise ValueError("retries must be a positive int") - - msg = f"artifact(s) for {repo}" + (f" matching name {name}" if name else "") - req_url = f"https://api.github.com/repos/{repo}/actions/artifacts" - page = 1 - params = {} - - if name is not None: - if not isinstance(name, str) or len(name) == 0: - raise ValueError("name must be a non-empty string") - params["name"] = name - - if per_page is not None: - if per_page < 1 or per_page > 100: - raise ValueError("per_page must be between 1 and 100") - params["per_page"] = int(per_page) - - def get_response_json(): - tries = 0 - params["page"] = page - request = get_request(req_url, params=params) - while True: - tries += 1 - try: - if verbose: - print(f"Fetching {msg} (page {page}, {per_page} per page)") - with urllib.request.urlopen(request, timeout=10) as resp: - return json.loads(resp.read().decode()) - except urllib.error.HTTPError as err: - if err.code == 401 and os.environ.get("GITHUB_TOKEN"): - raise ValueError("GITHUB_TOKEN env is invalid") from err - elif err.code == 403 and "rate limit exceeded" in err.reason: - raise ValueError( - f"use GITHUB_TOKEN env to bypass rate limit ({err})" - ) from err - elif err.code in (404, 503) and tries < retries: - # GitHub sometimes returns this error for valid URLs, so retry - warn(f"URL request try {tries} failed ({err})") - continue - raise RuntimeError(f"cannot retrieve data from {req_url}") from err - - artifacts = [] - diff = 1 - max_pages = max_pages if max_pages else sys.maxsize - while diff > 0 and page <= max_pages: - result = get_response_json() - total = 
result["total_count"] - if page == 1: - print(f"Repo {repo} has {total} artifact(s)") - - page += 1 - artifacts.extend(result["artifacts"]) - diff = total - len(artifacts) - - if verbose: - print(f"Found {len(artifacts)} {msg}") - - return artifacts - - -def download_artifact( - repo, - id, - path: Optional[PathLike] = None, - delete_zip=True, - retries=3, - verbose=False, -): - """ - Download and unzip a GitHub Actions artifact, selected by its ID. - - Parameters - ---------- - repo : str - The repository (format must be owner/name) - id : str - The artifact ID - path : PathLike - Path where the zip file will be saved (default is current path) - delete_zip : bool - Whether the zip file should be deleted after it is unzipped (default is True) - retries : int - The maximum number of retries for each request - verbose : bool - Whether to show verbose output - """ - - if "/" not in repo: - raise ValueError("repo format must be owner/name") - - if not isinstance(retries, int) or retries < 1: - raise ValueError("retries must be a positive int") - - req_url = f"https://api.github.com/repos/{repo}/actions/artifacts/{id}/zip" - request = urllib.request.Request(req_url) - if "github.com" in req_url: - github_token = os.environ.get("GITHUB_TOKEN", None) - if github_token: - request.add_header("Authorization", f"Bearer {github_token}") - - zip_path = Path(path).expanduser().absolute() / f"{uuid4()!s}.zip" - tries = 0 - while True: - tries += 1 - try: - with ( - urllib.request.urlopen(request) as url_file, - zip_path.open("wb") as out_file, - ): - content = url_file.read() - out_file.write(content) - break - except urllib.error.HTTPError as err: - if tries < retries: - warn(f"URL request try {tries} failed ({err})") - continue - else: - raise RuntimeError(f"cannot retrieve data from {req_url}") from err - - if verbose: - print(f"Uncompressing: {zip_path}") - - z = MFZipFile(zip_path) - z.extractall(str(path)) - z.close() - - if delete_zip: - if verbose: - print(f"Deleting 
zipfile {zip_path}") - zip_path.unlink() - - def download_and_unzip( url: str, path: Optional[PathLike] = None,