Skip to content

Commit

Permalink
refactor(download): remove asset/artifact utilities (MODFLOW-USGS#175)
Browse files Browse the repository at this point in the history
Remove functions for listing assets and downloading artifacts. We can just use the GH CLI/API and/or download-artifact in CI workflows. There is always PyGithub if something similar is really needed.
  • Loading branch information
wpbonelli authored Jan 17, 2025
1 parent 5074b21 commit 6957554
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 302 deletions.
43 changes: 0 additions & 43 deletions autotest/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@

from modflow_devtools.download import (
download_and_unzip,
download_artifact,
get_release,
get_releases,
list_artifacts,
)
from modflow_devtools.markers import requires_github

Expand Down Expand Up @@ -60,47 +58,6 @@ def test_get_release(repo):
assert set(actual_names) >= set(expected_names)


@flaky
@requires_github
@pytest.mark.parametrize("name", [None, "rtd-files", "run-time-comparison"])
@pytest.mark.parametrize("per_page", [None, 100])
def test_list_artifacts(name, per_page):
    """
    List artifacts of the modflow6 repository (at most two pages) and,
    when a name filter was given and something came back, check that
    every returned record carries exactly that name.
    """
    found = list_artifacts(
        "MODFLOW-USGS/modflow6",
        name=name,
        per_page=per_page,
        max_pages=2,
        verbose=True,
    )

    # Nothing to verify if no non-empty record was returned
    # or if the listing was not filtered by name.
    if not any(found) or not name:
        return

    for artifact in found:
        assert name == artifact["name"]


@flaky
@requires_github
@pytest.mark.parametrize("delete_zip", [True, False])
def test_download_artifact(function_tmpdir, delete_zip):
    """
    Download the first artifact listed for the modflow6 repository into
    the temporary directory and check that a zip file is left behind
    only when ``delete_zip`` is False. Skips if no artifact is listed.
    """
    repo = "MODFLOW-USGS/modflow6"
    listed = list_artifacts(repo, max_pages=1, verbose=True)
    first = listed[0] if listed else None

    if not first:
        pytest.skip(f"No artifacts found for repo: {repo}")

    download_artifact(
        repo=repo,
        id=first["id"],
        path=function_tmpdir,
        delete_zip=delete_zip,
        verbose=False,
    )

    # When the zip is kept, at least that one file must exist.
    minimum_entries = 0 if delete_zip else 1
    entries = list(function_tmpdir.rglob("*"))
    assert len(entries) >= minimum_entries

    # A zip file remains exactly when deletion was not requested.
    zip_left_behind = any(list(function_tmpdir.rglob("*.zip")))
    assert zip_left_behind != delete_zip


@flaky
@requires_github
@pytest.mark.parametrize("delete_zip", [True, False])
Expand Down
51 changes: 3 additions & 48 deletions docs/md/download.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Web utilities

Some utility functions are provided for common web requests. Most use the GitHub API to query information or download artifacts and assets. See this project's test cases (in particular `test_download.py`) for detailed usage examples.
Some utility functions are provided for GitHub-related web requests. See this project's test cases (in particular `test_download.py`) for detailed usage examples.

**Note:** to avoid GitHub API rate limits when using these functions, it is recommended to set the `GITHUB_TOKEN` environment variable. If this variable is set, the token will be included in requests sent to the API.

Expand All @@ -18,56 +18,11 @@ assets = release["assets"]
print([asset["name"] for asset in assets])
```

This yields `['code.json', 'linux.zip', 'mac.zip', 'win64.zip']`.

Equivalently, using the `get_release_assets()` function to list the latest release assets directly:

```python
from modflow_devtools.download import get_release_assets

assets = get_release_assets("MODFLOW-USGS/executables")
print([asset["name"] for asset in assets])
```

The `simple` parameter, defaulting to `False`, can be toggled to return a simple dictionary mapping asset names to download URLs:

```python
from pprint import pprint

assets = get_release_assets("MODFLOW-USGS/executables", simple=True)
pprint(assets)
```

This prints:

```
{'code.json': 'https://github.com/MODFLOW-USGS/executables/releases/download/12.0/code.json',
'linux.zip': 'https://github.com/MODFLOW-USGS/executables/releases/download/12.0/linux.zip',
'mac.zip': 'https://github.com/MODFLOW-USGS/executables/releases/download/12.0/mac.zip',
'win64.zip': 'https://github.com/MODFLOW-USGS/executables/releases/download/12.0/win64.zip'}
```
This prints `['code.json', 'linux.zip', 'mac.zip', 'win64.zip']`.

## Downloads

The `download_artifact` function downloads and unzips the GitHub Actions artifact with the given ID to the given path, optionally deleting the zipfile afterwards. The `repo` format is `owner/name`, as in GitHub URLs. For instance:

```python
from modflow_devtools.download import list_artifacts, download_artifact

repo = "MODFLOW-USGS/modflow6"
artifacts = list_artifacts(repo, max_pages=1, verbose=True)
artifact = next(iter(artifacts), None)
if artifact:
download_artifact(
repo=repo,
id=artifact["id"],
path=function_tmpdir,
delete_zip=False,
verbose=False,
)
```

The `download_and_unzip` function is a more generic alternative for downloading and unzipping files from arbitrary URLs.
The `download_and_unzip` function downloads and unzips zip files.

For instance, to download a MODFLOW 6.4.1 Linux distribution and delete the zipfile after extracting:

Expand Down
212 changes: 1 addition & 211 deletions modflow_devtools/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import urllib.request
from os import PathLike
from pathlib import Path
from typing import Optional, Union
from uuid import uuid4
from typing import Optional
from warnings import warn

from modflow_devtools.zip import MFZipFile
Expand Down Expand Up @@ -219,215 +218,6 @@ def get_latest_version(repo, retries=3, verbose=False) -> str:
return release["tag_name"]


def get_release_assets(
    repo, tag="latest", simple=False, retries=3, verbose=False
) -> Union[dict, list[dict]]:
    """
    Get assets corresponding to the given release.

    Parameters
    ----------
    repo : str
        The repository (format must be owner/name)
    tag : str
        The release tag to retrieve assets for
    simple : bool
        If True, return a dict mapping asset names to download URLs, otherwise (by
        default) a list of dicts containing asset info as returned by the GitHub API
    retries : int
        The maximum number of retries for each request
    verbose : bool
        Whether to show verbose output

    Returns
    -------
    A list of dicts if simple is False, one per release asset.
    If simple is True, a dict mapping asset names to download URLs.

    Raises
    ------
    ValueError
        If repo is not a string in owner/name format, if tag is not a
        non-empty string, or if retries is not a positive int.
    """

    # Check the type first so a non-string repo (e.g. None) is reported with
    # the same ValueError as a malformed one, not a TypeError from `in`.
    if not isinstance(repo, str) or "/" not in repo:
        raise ValueError("repo format must be owner/name")

    # Plain truthiness is the emptiness check for a string.
    if not isinstance(tag, str) or not tag:
        raise ValueError("tag must be a non-empty string")

    if not isinstance(retries, int) or retries < 1:
        raise ValueError("retries must be a positive int")

    release = get_release(repo, tag=tag, retries=retries, verbose=verbose)
    assets = release["assets"]
    if simple:
        return {a["name"]: a["browser_download_url"] for a in assets}
    return assets


def list_artifacts(
    repo, name=None, per_page=30, max_pages=10, retries=3, verbose=False
) -> list[dict]:
    """
    List artifacts for the given repository, optionally filtering by name (exact match).

    Pages are requested one at a time until the number of artifacts collected
    reaches the total reported by the API, a page comes back empty, or max_pages
    pages have been fetched, whichever happens first. Pass a falsy max_pages
    (None or 0) to remove the page limit.

    Parameters
    ----------
    repo : str
        The repository (format must be owner/name)
    name : str
        The artifact name (must be an exact match)
    per_page : int
        The number of artifacts to return per page (must be between 1-100, inclusive).
        If None, no page size is sent with the request.
    max_pages : int
        The maximum number of pages to retrieve (i.e. the number of requests to make)
    retries : int
        The maximum number of retries for each request
    verbose : bool
        Whether to show verbose output

    Returns
    -------
    A list of dictionaries, each containing information
    about an artifact as returned by the GitHub API.

    Raises
    ------
    ValueError
        If an argument is invalid, if GITHUB_TOKEN is set but rejected (HTTP 401),
        or if the API rate limit was exceeded (HTTP 403).
    RuntimeError
        If a page could not be retrieved for any other HTTP error, or after
        the allowed number of tries for HTTP 404/503.
    """

    if "/" not in repo:
        raise ValueError("repo format must be owner/name")

    if not isinstance(retries, int) or retries < 1:
        raise ValueError("retries must be a positive int")

    msg = f"artifact(s) for {repo}" + (f" matching name {name}" if name else "")
    req_url = f"https://api.github.com/repos/{repo}/actions/artifacts"
    params = {}

    if name is not None:
        if not isinstance(name, str) or not name:
            raise ValueError("name must be a non-empty string")
        params["name"] = name

    if per_page is not None:
        if per_page < 1 or per_page > 100:
            raise ValueError("per_page must be between 1 and 100")
        params["per_page"] = int(per_page)

    def get_response_json(page):
        """Fetch and decode one page of results, retrying on HTTP 404/503."""
        tries = 0
        params["page"] = page
        request = get_request(req_url, params=params)
        while True:
            tries += 1
            try:
                if verbose:
                    print(f"Fetching {msg} (page {page}, {per_page} per page)")
                with urllib.request.urlopen(request, timeout=10) as resp:
                    return json.loads(resp.read().decode())
            except urllib.error.HTTPError as err:
                if err.code == 401 and os.environ.get("GITHUB_TOKEN"):
                    raise ValueError("GITHUB_TOKEN env is invalid") from err
                # str() guards against a reason that is not a plain string
                elif err.code == 403 and "rate limit exceeded" in str(err.reason):
                    raise ValueError(
                        f"use GITHUB_TOKEN env to bypass rate limit ({err})"
                    ) from err
                elif err.code in (404, 503) and tries < retries:
                    # GitHub sometimes returns this error for valid URLs, so retry
                    warn(f"URL request try {tries} failed ({err})")
                    continue
                raise RuntimeError(f"cannot retrieve data from {req_url}") from err

    artifacts = []
    page = 1
    page_limit = max_pages if max_pages else sys.maxsize
    while page <= page_limit:
        result = get_response_json(page)
        total = result["total_count"]
        batch = result["artifacts"]

        # Informational output only; stay silent unless verbose was requested.
        if verbose and page == 1:
            print(f"Repo {repo} has {total} artifact(s)")

        artifacts.extend(batch)
        page += 1

        # Stop once everything reported has been collected. Also stop on an
        # empty page: if fewer artifacts are served than total_count claims,
        # further requests cannot make progress and, with no page limit,
        # would never terminate.
        if not batch or len(artifacts) >= total:
            break

    if verbose:
        print(f"Found {len(artifacts)} {msg}")

    return artifacts


def download_artifact(
    repo,
    id,
    path: Optional[PathLike] = None,
    delete_zip=True,
    retries=3,
    verbose=False,
):
    """
    Download and unzip a GitHub Actions artifact, selected by its ID.

    The zip file is saved under a random (UUID) name in the destination
    directory and extracted into that same directory.

    Parameters
    ----------
    repo : str
        The repository (format must be owner/name)
    id : str
        The artifact ID
    path : PathLike
        Path where the zip file will be saved and extracted (default is
        current path). The directory is created if it does not exist.
    delete_zip : bool
        Whether the zip file should be deleted after it is unzipped (default is True)
    retries : int
        The maximum number of retries for each request
    verbose : bool
        Whether to show verbose output

    Raises
    ------
    ValueError
        If repo is not in owner/name format or retries is not a positive int.
    RuntimeError
        If the artifact could not be downloaded within the allowed number of tries.
    """

    if "/" not in repo:
        raise ValueError("repo format must be owner/name")

    if not isinstance(retries, int) or retries < 1:
        raise ValueError("retries must be a positive int")

    # Honor the documented default: with no path given, use the current
    # directory (Path(None) would raise a TypeError).
    dest = Path.cwd() if path is None else Path(path).expanduser().absolute()
    dest.mkdir(parents=True, exist_ok=True)

    req_url = f"https://api.github.com/repos/{repo}/actions/artifacts/{id}/zip"
    request = urllib.request.Request(req_url)
    github_token = os.environ.get("GITHUB_TOKEN", None)
    if github_token:
        request.add_header("Authorization", f"Bearer {github_token}")

    zip_path = dest / f"{uuid4()!s}.zip"
    tries = 0
    while True:
        tries += 1
        try:
            with urllib.request.urlopen(request) as url_file:
                content = url_file.read()
            break
        except urllib.error.HTTPError as err:
            if tries < retries:
                warn(f"URL request try {tries} failed ({err})")
                continue
            raise RuntimeError(f"cannot retrieve data from {req_url}") from err

    zip_path.write_bytes(content)

    if verbose:
        print(f"Uncompressing: {zip_path}")

    # Extract next to the zip file (the resolved destination, so "~" in the
    # given path is not treated as a literal directory name) and make sure
    # the archive handle is released even if extraction fails.
    z = MFZipFile(zip_path)
    try:
        z.extractall(str(dest))
    finally:
        z.close()

    if delete_zip:
        if verbose:
            print(f"Deleting zipfile {zip_path}")
        zip_path.unlink()


def download_and_unzip(
url: str,
path: Optional[PathLike] = None,
Expand Down

0 comments on commit 6957554

Please sign in to comment.