refactor: create antismash downloader module and move there inherent code (#127)

* add module for antismash downloader

* create module specific for podp downloader

* move Downloader class module to inherent podp folder

* move antismash loader to antismash folder

* delete old antismash loader file

* update init files

* add tests

* add PathLike type hint

* commit suggested edits for antismash downloader module

* add suggestions to the antismash downloader test

* run yapf on antismash downloader files

* replace os.path.* with pathlib.Path

* move _check_roots func above _check_extract_path

---------

Co-authored-by: Cunliang Geng <[email protected]>
gcroci2 and CunliangGeng authored Mar 20, 2023
1 parent b5fef0f commit 53a50c9
Showing 9 changed files with 557 additions and 423 deletions.
4 changes: 3 additions & 1 deletion src/nplinker/genomics/antismash/__init__.py
@@ -1,7 +1,9 @@
import logging
from .antismash_downloader import download_and_extract_antismash_data
from .antismash_loader import AntismashBGCLoader
from .antismash_loader import parse_bgc_genbank


logging.getLogger(__name__).addHandler(logging.NullHandler())

__all__ = ["AntismashBGCLoader", "parse_bgc_genbank"]
__all__ = ["AntismashBGCLoader", "parse_bgc_genbank", "download_and_extract_antismash_data"]
84 changes: 84 additions & 0 deletions src/nplinker/genomics/antismash/antismash_downloader.py
@@ -0,0 +1,84 @@
import os
from os import PathLike
from pathlib import Path
import shutil
from nplinker.logconfig import LogConfig
from nplinker.utils import download_and_extract_archive
from nplinker.utils import list_dirs
from nplinker.utils import list_files

logger = LogConfig.getLogger(__name__)

# URLs used to download antiSMASH data
ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/'
ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}'

# The antiSMASH DBV2 URLs are kept so that the old database version remains available.
ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/'
ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}'


def download_and_extract_antismash_data(antismash_id: str,
                                        download_root: str | PathLike,
                                        extract_root: str | PathLike) -> None:
    """Download and extract the antiSMASH BGC archive for a specified genome.

    The antiSMASH database (https://antismash-db.secondarymetabolites.org/)
    is used to download the BGC archive. antiSMASH uses the RefSeq assembly id
    of a genome as the id of the archive.

    Args:
        antismash_id(str): The id used to download the BGC archive from the antiSMASH database.
            If the id is versioned (e.g., "GCF_004339725.1"), please be sure to
            specify the version as well.
        download_root(str | PathLike): Path to the directory in which to place the downloaded archive.
        extract_root(str | PathLike): Path to the directory the data files will be extracted to.
            Note that an `antismash` directory will be created in the specified `extract_root` if
            it doesn't exist. The files will be extracted to the `<extract_root>/antismash/<antismash_id>` directory.

    Raises:
        ValueError: if download_root and extract_root dirs are the same.
        ValueError: if the <extract_root>/antismash/<antismash_id> dir is not empty.

    Examples:
        >>> download_and_extract_antismash_data("GCF_004339725.1", "/data/download", "/data/extracted")
    """
    download_root = Path(download_root)
    extract_root = Path(extract_root)
    extract_path = extract_root / "antismash" / antismash_id

    _check_roots(download_root, extract_root)
    if extract_path.exists():
        _check_extract_path(extract_path)
    else:
        extract_path.mkdir(parents=True, exist_ok=True)

    # only the first URL (the current antiSMASH-DB) is attempted here
    for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]:
        url = base_url.format(antismash_id, antismash_id + '.zip')
        download_and_extract_archive(url, download_root, extract_path,
                                     antismash_id + '.zip')
        break

    # delete subdirs
    for subdir_path in list_dirs(extract_path):
        shutil.rmtree(subdir_path)

    # delete unnecessary files
    files_to_keep = list_files(extract_path, suffix=(".json", ".gbk"))
    for file in list_files(extract_path):
        if file not in files_to_keep:
            os.remove(file)

    logger.info('antiSMASH BGC data of %s is downloaded and extracted.',
                antismash_id)


def _check_roots(download_root: PathLike, extract_root: PathLike):
    if download_root == extract_root:
        raise ValueError(
            "Identical path of download directory and extract directory")


def _check_extract_path(extract_path: PathLike):
    # check if extract_path is empty
    if any(os.scandir(extract_path)):
        raise ValueError(f'Nonempty directory: "{extract_path}"')
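As a usage sketch of the function above (the paths and the try/except wrapper are illustrative, not part of the commit): identical download and extract roots are rejected by _check_roots, while a valid call leaves only .json and .gbk files under <extract_root>/antismash/<antismash_id>.

from pathlib import Path
from nplinker.genomics.antismash.antismash_downloader import download_and_extract_antismash_data

download_root = Path("/data/download")   # the zip archive is stored here
extract_root = Path("/data/extracted")   # files end up under /data/extracted/antismash/GCF_004339725.1

try:
    # identical roots raise ValueError via _check_roots
    download_and_extract_antismash_data("GCF_004339725.1", download_root, download_root)
except ValueError as err:
    print(err)

# valid call: downloads GCF_004339725.1.zip and keeps only .json/.gbk files after extraction
download_and_extract_antismash_data("GCF_004339725.1", download_root, extract_root)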
3 changes: 1 addition & 2 deletions src/nplinker/genomics/antismash/antismash_loader.py
@@ -140,8 +140,7 @@ def parse_bgc_genbank(file: str) -> BGC:
    product_prediction = features.get("product")
    if product_prediction is None:
        raise ValueError(
            "Not found product prediction in antiSMASH Genbank file {}".format(
                file))
            f"Not found product prediction in antiSMASH Genbank file {file}")

    # init BGC
    bgc = BGC(bgc_id=fname, product_prediction=product_prediction)
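The changed lines above sit inside parse_bgc_genbank, which turns a single antiSMASH region GenBank file into a BGC object and raises ValueError when no product prediction is found. A hedged sketch of calling it (the file path is illustrative, and it assumes the returned BGC exposes the product_prediction it is constructed with):

from nplinker.genomics.antismash import parse_bgc_genbank

# parse one antiSMASH region .gbk file; raises ValueError if the "product" feature is missing
bgc = parse_bgc_genbank("/data/extracted/antismash/GCF_004339725.1/NZ_CP041656.1.region001.gbk")
print(bgc.product_prediction)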
1 change: 1 addition & 0 deletions src/nplinker/pairedomics/__init__.py
@@ -0,0 +1 @@
from .podp_antismash_downloader import download_antismash_data