refactor: create antismash downloader module and move there inherent code (#127)

* add module for antismash downloader

* create module specific for podp downloader

* move Downloader class module to inherent podp folder

* move antismash loader to antismash folder

* delete old antismash loader file

* update init files

* add tests

* add PathLike type hint

* commit suggested edits for antismash downloader module

* add suggestions to the antismash downloader test

* run yapf on antismash downloader files

* replace os.path.* with pathlib.Path

* move _check_roots func above _check_extract_path

---------

Co-authored-by: Cunliang Geng <[email protected]>
gcroci2 and CunliangGeng authored Mar 20, 2023
1 parent b5fef0f commit 53a50c9
Showing 9 changed files with 557 additions and 423 deletions.
4 changes: 3 additions & 1 deletion src/nplinker/genomics/antismash/__init__.py
@@ -1,7 +1,9 @@
import logging
from .antismash_downloader import download_and_extract_antismash_data
from .antismash_loader import AntismashBGCLoader
from .antismash_loader import parse_bgc_genbank


logging.getLogger(__name__).addHandler(logging.NullHandler())

__all__ = ["AntismashBGCLoader", "parse_bgc_genbank"]
__all__ = ["AntismashBGCLoader", "parse_bgc_genbank", "download_and_extract_antismash_data"]
84 changes: 84 additions & 0 deletions src/nplinker/genomics/antismash/antismash_downloader.py
@@ -0,0 +1,84 @@
import os
from os import PathLike
from pathlib import Path
import shutil
from nplinker.logconfig import LogConfig
from nplinker.utils import download_and_extract_archive
from nplinker.utils import list_dirs
from nplinker.utils import list_files

logger = LogConfig.getLogger(__name__)

# URLs used to download antiSMASH data
ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/'
ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}'

# The antiSMASH DBV2 URLs are kept so that the old database version remains available.
ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/'
ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}'


def download_and_extract_antismash_data(antismash_id: str,
                                        download_root: str | PathLike,
                                        extract_root: str | PathLike) -> None:
    """Download and extract the antiSMASH BGC archive for a specified genome.

    The antiSMASH database (https://antismash-db.secondarymetabolites.org/)
    is used to download the BGC archive. antiSMASH uses the RefSeq assembly id
    of a genome as the id of the archive.

    Args:
        antismash_id(str): The id used to download the BGC archive from the antiSMASH database.
            If the id is versioned (e.g., "GCF_004339725.1"), please be sure to
            specify the version as well.
        download_root(str | PathLike): Path to the directory in which to place the downloaded archive.
        extract_root(str | PathLike): Path to the directory the data files will be extracted to.
            Note that an `antismash` directory will be created in the specified `extract_root` if
            it doesn't exist. The files will be extracted to the `<extract_root>/antismash/<antismash_id>` directory.

    Raises:
        ValueError: if download_root and extract_root dirs are the same.
        ValueError: if the <extract_root>/antismash/<antismash_id> dir is not empty.

    Examples:
        >>> download_and_extract_antismash_data("GCF_004339725.1", "/data/download", "/data/extracted")
    """
    download_root = Path(download_root)
    extract_root = Path(extract_root)
    extract_path = extract_root / "antismash" / antismash_id

    _check_roots(download_root, extract_root)
    if extract_path.exists():
        _check_extract_path(extract_path)
    else:
        extract_path.mkdir(parents=True, exist_ok=True)

    # only the first URL (the current antiSMASH-DB) is attempted here
    for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]:
        url = base_url.format(antismash_id, antismash_id + '.zip')
        download_and_extract_archive(url, download_root, extract_path,
                                     antismash_id + '.zip')
        break

    # delete subdirs
    for subdir_path in list_dirs(extract_path):
        shutil.rmtree(subdir_path)

    # delete unnecessary files
    files_to_keep = list_files(extract_path, suffix=(".json", ".gbk"))
    for file in list_files(extract_path):
        if file not in files_to_keep:
            os.remove(file)

    logger.info('antiSMASH BGC data of %s is downloaded and extracted.',
                antismash_id)


def _check_roots(download_root: PathLike, extract_root: PathLike):
    if download_root == extract_root:
        raise ValueError(
            "Identical path of download directory and extract directory")


def _check_extract_path(extract_path: PathLike):
    # check if extract_path is empty
    if any(os.scandir(extract_path)):
        raise ValueError(f'Nonempty directory: "{extract_path}"')
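As a usage sketch of the function above (the paths and the try/except wrapper are illustrative, not part of the commit): identical download and extract roots are rejected by _check_roots, while a valid call leaves only .json and .gbk files under <extract_root>/antismash/<antismash_id>.

from pathlib import Path
from nplinker.genomics.antismash.antismash_downloader import download_and_extract_antismash_data

download_root = Path("/data/download")   # the zip archive is stored here
extract_root = Path("/data/extracted")   # files end up under /data/extracted/antismash/GCF_004339725.1

try:
    # identical roots raise ValueError via _check_roots
    download_and_extract_antismash_data("GCF_004339725.1", download_root, download_root)
except ValueError as err:
    print(err)

# valid call: downloads GCF_004339725.1.zip and keeps only .json/.gbk files after extraction
download_and_extract_antismash_data("GCF_004339725.1", download_root, extract_root)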
3 changes: 1 addition & 2 deletions src/nplinker/genomics/antismash/antismash_loader.py
@@ -140,8 +140,7 @@ def parse_bgc_genbank(file: str) -> BGC:
    product_prediction = features.get("product")
    if product_prediction is None:
        raise ValueError(
            "Not found product prediction in antiSMASH Genbank file {}".format(
                file))
            f"Not found product prediction in antiSMASH Genbank file {file}")

    # init BGC
    bgc = BGC(bgc_id=fname, product_prediction=product_prediction)
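The changed lines above sit inside parse_bgc_genbank, which turns a single antiSMASH region GenBank file into a BGC object and raises ValueError when no product prediction is found. A hedged sketch of calling it (the file path is illustrative, and it assumes the returned BGC exposes the product_prediction it is constructed with):

from nplinker.genomics.antismash import parse_bgc_genbank

# parse one antiSMASH region .gbk file; raises ValueError if the "product" feature is missing
bgc = parse_bgc_genbank("/data/extracted/antismash/GCF_004339725.1/NZ_CP041656.1.region001.gbk")
print(bgc.product_prediction)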
1 change: 1 addition & 0 deletions src/nplinker/pairedomics/__init__.py
@@ -0,0 +1 @@
from .podp_antismash_downloader import download_antismash_data