Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: create antismash downloader module and move there inherent code #127

Merged
merged 24 commits into from
Mar 20, 2023
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
5428dca
add module for antismash downloader
gcroci2 Mar 15, 2023
8dc0f74
create module specific for podp downloader
gcroci2 Mar 15, 2023
3b5d5d9
move Downloader class module to inherent podp folder
gcroci2 Mar 15, 2023
899b92e
move antismash loader to antismash folder
gcroci2 Mar 15, 2023
0f4a68d
delete old antismash loader file
gcroci2 Mar 15, 2023
b944431
update init files
gcroci2 Mar 15, 2023
e5d891f
add tests
gcroci2 Mar 15, 2023
f27d56a
Merge branch 'dev' into 98_add_antismash_downloader_fixed_gcroci2
gcroci2 Mar 15, 2023
740f097
delete redundant line
gcroci2 Mar 16, 2023
1300cea
rename antismash files with unique names
gcroci2 Mar 16, 2023
1285b73
add PathLike type hint
gcroci2 Mar 16, 2023
1fdb1ee
commit suggested edits for antismash downloader module
gcroci2 Mar 16, 2023
5e1d745
add suggestions to the antismash downloader test
gcroci2 Mar 16, 2023
5dc1839
rerun yapf on antismash downloader files
gcroci2 Mar 16, 2023
3de49d5
replace os.path.* with pathlib.Path
gcroci2 Mar 17, 2023
2b7786e
Update src/nplinker/genomics/antismash/antismash_downloader.py
gcroci2 Mar 17, 2023
cf499b8
add suggestions to antismash_downloader
gcroci2 Mar 17, 2023
1fbb2cf
add last suggestions
gcroci2 Mar 17, 2023
e88b172
edit assert in test_antismash_downloader
gcroci2 Mar 17, 2023
88914f3
run yapf on test_antismash_downloader
gcroci2 Mar 17, 2023
aa91584
Update src/nplinker/genomics/antismash/antismash_downloader.py
gcroci2 Mar 20, 2023
ef740aa
Update src/nplinker/genomics/antismash/antismash_downloader.py
gcroci2 Mar 20, 2023
61b4810
Update src/nplinker/genomics/antismash/antismash_downloader.py
gcroci2 Mar 20, 2023
bab7aad
move _check_roots func above _check_extract_path
gcroci2 Mar 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/nplinker/genomics/antismash/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
from .antismash_downloader import download_and_extract_antismash_data
from .antismash_loader import AntismashBGCLoader
from .antismash_loader import parse_bgc_genbank


# Attach a no-op handler to this package's logger.
logging.getLogger(__name__).addHandler(logging.NullHandler())

# Public names of the package.
# NOTE(review): `__all__` is assigned twice in a row; the second assignment
# replaces the first, so the effective list is the one that also contains
# "download_and_extract_antismash_data". The first line looks like the
# pre-change side of the diff -- confirm and drop it.
__all__ = ["AntismashBGCLoader", "parse_bgc_genbank"]
__all__ = ["AntismashBGCLoader", "parse_bgc_genbank", "download_and_extract_antismash_data"]
87 changes: 87 additions & 0 deletions src/nplinker/genomics/antismash/antismash_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import os
from os import PathLike
from pathlib import Path
import shutil
from nplinker.logconfig import LogConfig
from nplinker.utils import download_and_extract_archive
from nplinker.utils import list_dirs
from nplinker.utils import list_files

# Module-level logger, obtained through the project's LogConfig helper.
logger = LogConfig.getLogger(__name__)

# URL templates for the current antiSMASH database. They are meant to be
# filled in with str.format(): the page URL has one placeholder, the download
# URL has two.
ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/'
ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}'

# Same templates for the older antiSMASH DB v2 host; kept so that the old
# version of the database remains available.
ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/'
ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}'


def download_and_extract_antismash_data(antismash_id: str,
                                        download_root: str | PathLike,
                                        extract_root: str | PathLike) -> None:
    """Download and extract antiSMASH BGC archive for a specified genome.

    The antiSMASH database (https://antismash-db.secondarymetabolites.org/)
    is used to download the BGC archive, and antiSMASH uses the RefSeq
    assembly id of a genome as the id of the archive. If the download from
    the current database fails, the antiSMASH DB v2 host is tried next.

    After extraction, every sub-directory of the extract directory is removed
    and only files with a ".json" or ".gbk" suffix are kept.

    Args:
        antismash_id(str): The id used to download BGC archive from antiSMASH database.
            If the id is versioned (e.g., "GCF_004339725.1") please be sure to
            specify the version as well.
        download_root(str | PathLike): Path to the directory to place downloaded archive in.
        extract_root(str | PathLike): Path to the directory data files will be extracted to.
            Note that an `antismash` directory will be created in the specified `extract_root` if
            it doesn't exist. The files will be extracted to `<extract_root>/antismash/<antismash_id>` directory.

    Raises:
        ValueError: if download_root and extract_root dirs are the same.
        ValueError: if <extract_root>/antismash/<antismash_id> dir is not empty.
        Exception: the error raised by the last download attempt, if the
            archive could not be fetched from any of the known hosts.

    Examples:
        >>> download_and_extract_antismash_data("GCF_004339725.1", "/data/download", "/data/extracted")
    """
    # Use fresh local names instead of re-annotating the parameters.
    download_dir = Path(download_root)
    extract_dir = Path(extract_root)
    extract_path = extract_dir / "antismash" / antismash_id

    _check_roots(download_dir, extract_dir)
    _check_extract_path(extract_path)

    archive_name = antismash_id + '.zip'
    last_error = None
    for base_url in (ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL):
        url = base_url.format(antismash_id, archive_name)
        try:
            download_and_extract_archive(url, download_dir, extract_path,
                                         archive_name)
        except Exception as err:  # pylint: disable=broad-except
            # The helper's exception types are not known here, so catch
            # broadly, remember the error and fall through to the next host.
            # Nothing is swallowed: the last error is re-raised below.
            logger.warning('Failed to download antiSMASH data from %s: %s',
                           url, err)
            last_error = err
        else:
            last_error = None
            break
    if last_error is not None:
        raise last_error

    # delete subdirs
    for subdir_path in list_dirs(extract_path):
        shutil.rmtree(subdir_path)

    # keep only the JSON and GenBank files of the archive
    files_to_keep = list_files(extract_path, suffix=(".json", ".gbk"))
    for file in list_files(extract_path):
        if file not in files_to_keep:
            os.remove(file)

    logger.info('antiSMASH BGC data of %s is downloaded and extracted.',
                antismash_id)


def _check_roots(download_root: str | PathLike, extract_root: str | PathLike):
if download_root == extract_root:
raise ValueError(
"Identical path of download directory and extract directory")


def _check_extract_path(extract_path: str | PathLike):
if os.path.exists(extract_path):
# check if extract_path is empty
files = os.listdir(extract_path)
if len(files) != 0:
raise ValueError(f'Nonempty directory: "{extract_path}"')
else:
os.makedirs(extract_path, exist_ok=True)
3 changes: 1 addition & 2 deletions src/nplinker/genomics/antismash/antismash_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,7 @@ def parse_bgc_genbank(file: str) -> BGC:
product_prediction = features.get("product")
if product_prediction is None:
raise ValueError(
"Not found product prediction in antiSMASH Genbank file {}".format(
file))
f"Not found product prediction in antiSMASH Genbank file {file}")

# init BGC
bgc = BGC(bgc_id=fname, product_prediction=product_prediction)
Expand Down
1 change: 1 addition & 0 deletions src/nplinker/pairedomics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .podp_antismash_downloader import download_antismash_data
Loading