Skip to content

Commit

Permalink
update GNPSDownloader class to support gnps2 (#294)
Browse files Browse the repository at this point in the history
Updated GNPSDownloader class to support gnps2 data.
  • Loading branch information
CunliangGeng authored Jan 20, 2025
1 parent 817afa8 commit 9839a82
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 60 deletions.
71 changes: 40 additions & 31 deletions src/nplinker/metabolomics/gnps/gnps_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ class GNPSDownloader:
Attributes:
GNPS_DATA_DOWNLOAD_URL: URL template for downloading GNPS data.
GNPS_DATA_DOWNLOAD_URL_FBMN: URL template for downloading GNPS data for FBMN.
gnps_format: GNPS workflow type.
"""

GNPS_DATA_DOWNLOAD_URL: str = (
Expand All @@ -27,52 +26,59 @@ class GNPSDownloader:
GNPS_DATA_DOWNLOAD_URL_FBMN: str = (
"https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_cytoscape_data"
)
GNPS2_DATA_DOWNLOAD_URL: str = "https://gnps2.org/taskzip?task={}"

def __init__(self, task_id: str, download_root: str | PathLike):
def __init__(
self, task_id: str, download_root: str | PathLike, gnps_version: str = "1"
) -> None:
"""Initialize the GNPSDownloader.
Args:
task_id: GNPS task id, identifying the data to be downloaded.
download_root: Path where to store the downloaded archive.
gnps_version: Version of GNPS platform that has been used to run the task.
Available values are "1" and "2". Choose "1" if the platform https://gnps.ucsd.edu/
has been used; or "2" for the platform https://gnps2.org/.
Raises:
ValueError: If the given task id does not correspond to a supported
GNPS workflow.
ValueError: If the given task id does not correspond to a supported GNPS workflow.
ValueError: If the given GNPS version is not valid.
Examples:
>>> GNPSDownloader("c22f44b14a3d450eb836d607cb9521bb", "~/downloads")
"""
gnps_format = gnps_format_from_gnps1_task_id(task_id)
if gnps_format == GNPSFormat.Unknown:
if gnps_version == "1":
gnps_format = gnps_format_from_gnps1_task_id(task_id)
if gnps_format == GNPSFormat.Unknown:
raise ValueError(
f"Unknown workflow type for GNPS task '{task_id}'."
f"Supported GNPS workflows are described in the GNPSFormat enum, "
f"including such as 'METABOLOMICS-SNETS', 'METABOLOMICS-SNETS-V2' "
f"and 'FEATURE-BASED-MOLECULAR-NETWORKING'."
)
self._gnps_format = gnps_format
self._file_name = gnps_format.value + "-" + task_id + ".zip"
elif gnps_version == "2":
self._file_name = task_id + ".tar"
else:
raise ValueError(
f"Unknown workflow type for GNPS task '{task_id}'."
f"Supported GNPS workflows are described in the GNPSFormat enum, "
f"including such as 'METABOLOMICS-SNETS', 'METABOLOMICS-SNETS-V2' "
f"and 'FEATURE-BASED-MOLECULAR-NETWORKING'."
f"Invalid GNPS version '{gnps_version}'. Supported versions are '1' and '2'."
)

self._task_id = task_id
self._download_root: Path = Path(download_root)
self._gnps_format = gnps_format
self._file_name = gnps_format.value + "-" + self._task_id + ".zip"

@property
def gnps_format(self) -> GNPSFormat:
"""Get the GNPS workflow type.
Returns:
GNPS workflow type.
"""
return self._gnps_format
self._gnps_version = gnps_version

def download(self) -> Self:
"""Download GNPS data.
Note: GNPS data is downloaded using the POST method (empty payload is OK).
"""
download_url(
self.get_url(), self._download_root, filename=self._file_name, http_method="POST"
)
"""Download GNPS data."""
if self._gnps_version == "1":
download_url(
self.get_url(), self._download_root, filename=self._file_name, http_method="POST"
)
if self._gnps_version == "2":
download_url(
self.get_url(), self._download_root, filename=self._file_name, http_method="GET"
)
return self

def get_download_file(self) -> str:
Expand All @@ -97,6 +103,9 @@ def get_url(self) -> str:
Returns:
URL pointing to the GNPS data to be downloaded.
"""
if self.gnps_format == GNPSFormat.FBMN:
return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL_FBMN.format(self._task_id)
return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL.format(self._task_id)
if self._gnps_version == "1":
if self._gnps_format == GNPSFormat.FBMN:
return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL_FBMN.format(self._task_id)
return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL.format(self._task_id)
if self._gnps_version == "2":
return GNPSDownloader.GNPS2_DATA_DOWNLOAD_URL.format(self._task_id)
14 changes: 14 additions & 0 deletions tests/unit/metabolomics/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,20 @@ def gnps_annotations_files(tmp_gnps_dir) -> dict[GNPSFormat, PathLike]:
#


@pytest.fixture(scope="session")
def gnps2_website_is_down():
    """Report whether the GNPS2 website appears to be unavailable.

    A GET request (following redirects) is sent to the GNPS2 landing page.

    Returns:
        True if the request raises `httpx.HTTPError` or the response is not
        flagged as successful by `Response.is_success`; False otherwise.
    """
    try:
        response = httpx.get("https://gnps2.org", follow_redirects=True)
    except httpx.HTTPError:
        # Any transport or protocol failure counts as "down".
        return True
    return not response.is_success


@pytest.fixture(scope="session")
def gnps2_tar_files() -> dict[GNPSFormat, PathLike]:
"""Get the paths of the GNPS2 tar archives as a dict.
Expand Down
86 changes: 57 additions & 29 deletions tests/unit/metabolomics/test_gnps_downloader.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
import tarfile
import zipfile
import pytest
from nplinker.metabolomics.gnps import GNPSDownloader
from nplinker.metabolomics.gnps import GNPSFormat


@pytest.fixture(scope="module", autouse=True)
def setup_with_fixture(gnps_website_is_down):
if gnps_website_is_down:
pytest.skip(
"GNPS website is down, skipping all tests in this module!", allow_module_level=True
)
def test_invalid_gnps_version(tmpdir):
    """Constructing a downloader with the unsupported version "3" raises ValueError."""
    unsupported_version = "3"
    with pytest.raises(ValueError, match="Invalid GNPS version '3'"):
        GNPSDownloader(
            "0ad6535e34d449788f297e712f43068a",
            tmpdir,
            gnps_version=unsupported_version,
        )


def test_unknown_workflow(tmpdir):
Expand All @@ -18,37 +16,33 @@ def test_unknown_workflow(tmpdir):


@pytest.mark.parametrize(
"task_id, expected",
[
["92036537c21b44c29e509291e53f6382", GNPSFormat.FBMN],
["c22f44b14a3d450eb836d607cb9521bb", GNPSFormat.SNETS],
["189e8bf16af145758b0a900f1c44ff4a", GNPSFormat.SNETSV2],
],
)
def test_supported_workflows(task_id, expected, tmpdir):
downloader = GNPSDownloader(task_id, tmpdir)
assert downloader.gnps_format == expected


@pytest.mark.parametrize(
"task_id, filename",
"gnps_version, task_id, filename",
[
[
"1",
"92036537c21b44c29e509291e53f6382",
GNPSFormat.FBMN.value + "-92036537c21b44c29e509291e53f6382.zip",
],
[
"1",
"c22f44b14a3d450eb836d607cb9521bb",
GNPSFormat.SNETS.value + "-c22f44b14a3d450eb836d607cb9521bb.zip",
],
[
"1",
"189e8bf16af145758b0a900f1c44ff4a",
GNPSFormat.SNETSV2.value + "-189e8bf16af145758b0a900f1c44ff4a.zip",
],
[
"2",
"206a7b40b7ed41c1ae6b4fbd2def3636",
"206a7b40b7ed41c1ae6b4fbd2def3636.tar",
],
["2", "2014f321d72542afb5216c932e0d5079", "2014f321d72542afb5216c932e0d5079.tar"],
],
)
def test_get_download_file(task_id, filename, tmpdir):
downloader = GNPSDownloader(task_id, tmpdir)
def test_get_download_file(gnps_version, task_id, filename, tmpdir):
downloader = GNPSDownloader(task_id, tmpdir, gnps_version)
assert downloader.get_download_file() == tmpdir / filename


Expand All @@ -66,24 +60,37 @@ def test_get_task_id(task_id, tmpdir):


@pytest.mark.parametrize(
"task_id, url",
"gnps_version, task_id, url",
[
[
"1",
"92036537c21b44c29e509291e53f6382",
GNPSDownloader.GNPS_DATA_DOWNLOAD_URL_FBMN.format("92036537c21b44c29e509291e53f6382"),
],
[
"1",
"c22f44b14a3d450eb836d607cb9521bb",
GNPSDownloader.GNPS_DATA_DOWNLOAD_URL.format("c22f44b14a3d450eb836d607cb9521bb"),
],
[
"1",
"189e8bf16af145758b0a900f1c44ff4a",
GNPSDownloader.GNPS_DATA_DOWNLOAD_URL.format("189e8bf16af145758b0a900f1c44ff4a"),
],
[
"2",
"206a7b40b7ed41c1ae6b4fbd2def3636",
GNPSDownloader.GNPS2_DATA_DOWNLOAD_URL.format("206a7b40b7ed41c1ae6b4fbd2def3636"),
],
[
"2",
"2014f321d72542afb5216c932e0d5079",
GNPSDownloader.GNPS2_DATA_DOWNLOAD_URL.format("2014f321d72542afb5216c932e0d5079"),
],
],
)
def test_get_url(task_id, url, tmpdir):
downloader = GNPSDownloader(task_id, tmpdir)
def test_get_url(gnps_version, task_id, url, tmpdir):
downloader = GNPSDownloader(task_id, tmpdir, gnps_version)
assert downloader.get_url() == url


Expand All @@ -95,11 +102,32 @@ def test_get_url(task_id, url, tmpdir):
["189e8bf16af145758b0a900f1c44ff4a", GNPSFormat.SNETSV2],
],
)
def test_downloads_file(task_id, workflow, tmpdir, gnps_zip_files):
downloader = GNPSDownloader(task_id, tmpdir)
def test_download_gnps1(task_id, workflow, tmpdir, gnps_zip_files, gnps_website_is_down):
if gnps_website_is_down:
pytest.skip("GNPS website is down: https://gnps.ucsd.edu")
downloader = GNPSDownloader(task_id, tmpdir, gnps_version="1")
downloader.download()
actual = zipfile.ZipFile(downloader.get_download_file())
actual_names = actual.namelist()
expected = zipfile.ZipFile(gnps_zip_files[workflow])
expected_names = [x.filename for x in expected.filelist if x.compress_size > 0]
assert all(item in actual_names for item in expected_names)
expected_names = expected.namelist()
assert actual_names == expected_names


@pytest.mark.parametrize(
    "task_id, workflow",
    [
        ["2014f321d72542afb5216c932e0d5079", GNPSFormat.GNPS2FBMN],
        ["206a7b40b7ed41c1ae6b4fbd2def3636", GNPSFormat.GNPS2CN],
    ],
)
def test_download_gnps2(task_id, workflow, tmpdir, gnps2_tar_files, gnps2_website_is_down):
    """Download a GNPS2 task archive and compare its member names with the reference tar.

    The test is skipped when the `gnps2_website_is_down` fixture is truthy.

    Args:
        task_id: GNPS2 task id to download.
        workflow: Key used to look up the reference archive in `gnps2_tar_files`.
        tmpdir: Temporary directory used as the download root.
        gnps2_tar_files: Mapping from workflow to the path of the reference tar archive.
        gnps2_website_is_down: Flag from the fixture of the same name.
    """
    if gnps2_website_is_down:
        pytest.skip("GNPS2 website is down: https://gnps2.org")

    downloader = GNPSDownloader(task_id, tmpdir, gnps_version="2")
    downloader.download()

    # Open each archive as a context manager so its file handle is closed
    # even if reading the member list fails.
    with tarfile.open(downloader.get_download_file()) as actual:
        actual_names = actual.getnames()
    with tarfile.open(gnps2_tar_files[workflow]) as expected:
        expected_names = expected.getnames()

    assert actual_names == expected_names

0 comments on commit 9839a82

Please sign in to comment.