Skip to content

Commit

Permalink
update gnpsformat detection function to support gnps2 (#293)
Browse files Browse the repository at this point in the history
Update function `gnps_format_from_archive` to support gnps2 data.
  • Loading branch information
CunliangGeng authored Jan 20, 2025
1 parent d79ee60 commit 817afa8
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 8 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ dependencies = [
"networkx",
"pandas",
"pyteomics",
"pyyaml",
"rich",
"scipy",
"sortedcontainers",
Expand Down
55 changes: 48 additions & 7 deletions src/nplinker/metabolomics/gnps/gnps_format.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from __future__ import annotations
import re
import tarfile
import zipfile
from enum import Enum
from enum import unique
from os import PathLike
from pathlib import Path
import httpx
import yaml
from bs4 import BeautifulSoup


Expand Down Expand Up @@ -72,17 +74,22 @@ def gnps_format_from_gnps1_task_id(task_id: str) -> GNPSFormat:
return GNPSFormat.Unknown


def gnps_format_from_archive(zip_file: str | PathLike) -> GNPSFormat:
"""Detect GNPS format from GNPS zip archive.
def gnps_format_from_archive(file: str | PathLike) -> GNPSFormat:
"""Detect GNPS format or workflow from GNPS archive file.
The detection is based on the filename of the zip file and the names of the
files contained in the zip file.
GNPS archive files can be in two formats: GNPS1 (.zip) and GNPS2 (.tar).
For GNPS1 data, the detection of workflow format is based on the filename of the zip archive and
the names of the files contained in the zip archive.
For GNPS2 data, the workflow format is taken from the `submission_parameters.yaml` file in the
tar archive, which has a key `workflowname`.
Args:
zip_file: Path to the GNPS zip file.
file: Path to the GNPS archive file.
Returns:
The format identified in the GNPS zip file.
The format identified in the GNPS archive file.
Examples:
>>> gnps_format_from_archive("ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip")
Expand All @@ -91,8 +98,22 @@ def gnps_format_from_archive(zip_file: str | PathLike) -> GNPSFormat:
<GNPSFormat.SNETSV2: 'METABOLOMICS-SNETS-V2'>
>>> gnps_format_from_archive("ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-672d0a53-download_cytoscape_data.zip")
<GNPSFormat.FBMN: 'FEATURE-BASED-MOLECULAR-NETWORKING'>
>>> gnps_format_from_archive("206a7b40b7ed41c1ae6b4fbd2def3636.tar")
<GNPSFormat.GNPS2CN: 'classical_networking_workflow'>
>>> gnps_format_from_archive("2014f321d72542afb5216c932e0d5079.tar")
<GNPSFormat.GNPS2FBMN: 'feature_based_molecular_networking_workflow'>
"""
file = Path(zip_file)
file = Path(file)
suffix = file.suffix
if suffix == ".zip":
return _gnps_format_from_archive_gnps1(file)
if suffix == ".tar":
return _gnps_format_from_archive_gnps2(file)
return GNPSFormat.Unknown


def _gnps_format_from_archive_gnps1(file: PathLike) -> GNPSFormat:
"""Detect GNPS format from GNPS1 archive file."""
# Guess the format from the filename of the zip file
if GNPSFormat.FBMN.value in file.name:
return GNPSFormat.FBMN
Expand All @@ -116,6 +137,26 @@ def gnps_format_from_archive(zip_file: str | PathLike) -> GNPSFormat:
return GNPSFormat.Unknown


def _gnps_format_from_archive_gnps2(file: PathLike) -> GNPSFormat:
    """Detect GNPS format from GNPS2 archive file.

    The workflow name is read from the `workflowname` key of the
    `submission_parameters.yaml` member of the tar archive.

    Args:
        file: Path to the GNPS2 tar archive.

    Returns:
        `GNPSFormat.GNPS2FBMN` or `GNPSFormat.GNPS2CN` when the workflow name matches the value of
        that enum member; `GNPSFormat.Unknown` when the parameters file is missing, is not a
        regular file, cannot be parsed as YAML, is not a mapping, or names any other workflow.
    """
    with tarfile.open(file, "r") as tar:
        try:
            # `extractfile` raises KeyError when the member does not exist and
            # returns None when the member is not a regular file or link.
            submission_file = tar.extractfile("submission_parameters.yaml")
            if submission_file is None:
                return GNPSFormat.Unknown
            # The member must be parsed while the archive is still open.
            with submission_file:
                submission_params = yaml.safe_load(submission_file)
        except (KeyError, yaml.YAMLError):
            return GNPSFormat.Unknown

    # `safe_load` yields None for an empty document and a list/scalar for
    # non-mapping YAML; neither has `.get`, so treat them as an unknown format.
    if not isinstance(submission_params, dict):
        return GNPSFormat.Unknown

    workflow = submission_params.get("workflowname")

    if workflow == GNPSFormat.GNPS2FBMN.value:
        return GNPSFormat.GNPS2FBMN
    if workflow == GNPSFormat.GNPS2CN.value:
        return GNPSFormat.GNPS2CN
    return GNPSFormat.Unknown


def gnps_format_from_file_mapping(file: str | PathLike) -> GNPSFormat:
"""Detect GNPS format from the given file mapping file.
Expand Down
Binary file not shown.
Binary file not shown.
Binary file added tests/unit/data/gnps/gnps2_nnknown.tar
Binary file not shown.
28 changes: 28 additions & 0 deletions tests/unit/metabolomics/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
from .. import GNPS_DATA_DIR


#
# Fixtures for GNPS1
#


@pytest.fixture(scope="session")
def gnps_website_is_down():
"""Check if the GNPS website is down."""
Expand Down Expand Up @@ -133,3 +138,26 @@ def gnps_annotations_files(tmp_gnps_dir) -> dict[GNPSFormat, PathLike]:
/ "DB_result"
/ "7dc5b46b50d94246a1de12ef485d0f75.tsv",
}


#
# Fixtures for GNPS2
#


@pytest.fixture(scope="session")
def gnps2_tar_files() -> dict[GNPSFormat, PathLike]:
    """Map each GNPS2 workflow format to the path of its example tar archive.

    The dict keys are members of the GNPSFormat enum.
    The dict values are the paths to the tar archives inside `GNPS_DATA_DIR`.

    The two workflow archives can be downloaded ("Download all results") from:
        - https://gnps2.org/status?task=2014f321d72542afb5216c932e0d5079
        - https://gnps2.org/status?task=206a7b40b7ed41c1ae6b4fbd2def3636
    """
    archive_names = (
        (GNPSFormat.GNPS2CN, "206a7b40b7ed41c1ae6b4fbd2def3636.tar"),
        (GNPSFormat.GNPS2FBMN, "2014f321d72542afb5216c932e0d5079.tar"),
        # File name kept exactly as the archive is stored in the test data.
        (GNPSFormat.Unknown, "gnps2_nnknown.tar"),
    )
    return {workflow: GNPS_DATA_DIR / name for workflow, name in archive_names}
14 changes: 13 additions & 1 deletion tests/unit/metabolomics/test_gnps_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from nplinker.metabolomics.gnps import gnps_format_from_gnps1_task_id


#
# Test GNPS1 formats
#
@pytest.mark.parametrize(
"task_id, expected",
[
Expand All @@ -24,7 +27,7 @@ def test_gnps_format_from_gnps1_task_id(task_id: str, expected: GNPSFormat, gnps
@pytest.mark.parametrize(
"workflow", [GNPSFormat.FBMN, GNPSFormat.SNETS, GNPSFormat.SNETSV2, GNPSFormat.Unknown]
)
def test_gnps_format_from_archive(workflow: str, gnps_zip_files):
def test_gnps_format_from_archive_gnps1(workflow: str, gnps_zip_files):
actual = gnps_format_from_archive(gnps_zip_files[workflow])
assert actual is workflow

Expand All @@ -33,3 +36,12 @@ def test_gnps_format_from_archive(workflow: str, gnps_zip_files):
def test_gnps_format_from_file_mapping(workflow: str, gnps_file_mappings_files):
actual = gnps_format_from_file_mapping(gnps_file_mappings_files[workflow])
assert actual is workflow


#
# Test GNPS2 formats
#
@pytest.mark.parametrize("workflow", [GNPSFormat.GNPS2CN, GNPSFormat.GNPS2FBMN, GNPSFormat.Unknown])
def test_gnps_format_from_archive_gnps2(workflow: str, gnps2_tar_files):
    """The format detected from a GNPS2 tar archive is the expected enum member."""
    archive = gnps2_tar_files[workflow]
    detected = gnps_format_from_archive(archive)
    assert detected is workflow

0 comments on commit 817afa8

Please sign in to comment.