From d146afa9dab6eb4b9adf4f6587630493ecd737db Mon Sep 17 00:00:00 2001 From: Liza Kozlova Date: Wed, 15 Nov 2023 15:45:05 +0000 Subject: [PATCH 1/3] refactor: make a processing extra with optional dependencies --- install_optional.sh | 4 +--- proteinflow/__init__.py | 1 + proteinflow/data/__init__.py | 25 ++++++++++++++++---- proteinflow/extra.py | 41 +++++++++++++++++++++++++++++++++ proteinflow/metrics/__init__.py | 22 +++++++++++++++--- proteinflow/visualize.py | 6 ++--- pyproject.toml | 17 ++++++++++---- 7 files changed, 97 insertions(+), 19 deletions(-) create mode 100644 proteinflow/extra.py diff --git a/install_optional.sh b/install_optional.sh index 5b58e49..28b0e65 100644 --- a/install_optional.sh +++ b/install_optional.sh @@ -8,7 +8,5 @@ python -m pip install "fair-esm[esmfold]" python -m pip install 'dllogger @ git+https://github.com/NVIDIA/dllogger.git' python -m pip install 'openfold @ git+https://github.com/aqlaboratory/openfold.git@4b41059694619831a7db195b7e0988fc4ff3a307' -python -m pip install ablang igfold immunebuilder - python -m pip install -e . -python -m pip install ipykernel \ No newline at end of file +# python -m pip install ipykernel \ No newline at end of file diff --git a/proteinflow/__init__.py b/proteinflow/__init__.py index 9344021..abf911b 100644 --- a/proteinflow/__init__.py +++ b/proteinflow/__init__.py @@ -171,6 +171,7 @@ "split": False, "cli": False, "ligand": False, + "extra": False, } __docformat__ = "numpy" diff --git a/proteinflow/data/__init__.py b/proteinflow/data/__init__.py index 5a7e837..dff49ae 100644 --- a/proteinflow/data/__init__.py +++ b/proteinflow/data/__init__.py @@ -18,18 +18,31 @@ from collections import defaultdict import Bio.PDB -import MDAnalysis as mda import numpy as np import pandas as pd -import py3Dmol from Bio import pairwise2 from biopandas.pdb import PandasPdb -from methodtools import lru_cache from torch import Tensor, from_numpy +try: + import MDAnalysis as mda +except ImportError: + pass +try: + from methodtools import lru_cache +except ImportError: + + def lru_cache(): + """Make a dummy decorator.""" + + def wrapper(func): + return func + + return wrapper + + from proteinflow.constants import ( _PMAP, - ACCENT_COLOR, ALPHABET, ALPHABET_REVERSE, ATOM_MASKS, @@ -52,6 +65,7 @@ _retrieve_chain_names, ) from proteinflow.download import download_fasta, download_pdb +from proteinflow.extra import _get_view, requires_extra from proteinflow.ligand import _get_ligands from proteinflow.metrics import ( ablang_pll, @@ -1979,6 +1993,7 @@ def align_structure(self, reference_pdb_path, save_pdb_path, chain_ids=None): io.save(save_pdb_path) @staticmethod + @requires_extra("mda", install_name="MDAnalysis") def combine_multiple_frames(files, output_path="combined.pdb"): """Combine multiple PDB files into a single multiframe PDB file. @@ -2570,7 +2585,7 @@ def visualize( accent_color=accent_color, ) vis_string = "".join([str(x) for x in outstr]) - view = py3Dmol.view(width=canvas_size[0], height=canvas_size[1]) + view = _get_view(canvas_size) view.addModelsAsFrames(vis_string) for i, at in enumerate(outstr): view.setStyle( diff --git a/proteinflow/extra.py b/proteinflow/extra.py new file mode 100644 index 0000000..c894c8b --- /dev/null +++ b/proteinflow/extra.py @@ -0,0 +1,41 @@ +"""Handling optional dependencies.""" + +try: + import py3Dmol +except ImportError: + pass + +import sys + + +def requires_extra(module_name, install_name=None): + """Generate a decorator to require an optional dependency for the given function. + + Parameters + ---------- + module_name : str + Name of the module to check for + install_name : str, optional + Name of the module to install if it is not found. If not specified, `module_name` is used + + """ + if install_name is None: + install_name = module_name + + def decorator(func): + def wrapper(*args, **kwargs): + if module_name not in sys.modules: + raise ImportError( + f"{install_name} must be installed to use this function. " + f"Install it with `pip install {install_name}` or together with most other optional dependencies with `pip install proteinflow[processing]`." + ) + return func(*args, **kwargs) + + return wrapper + + return decorator + + +@requires_extra("py3Dmol") +def _get_view(canvas_size): + return py3Dmol.view(width=canvas_size[0], height=canvas_size[1]) diff --git a/proteinflow/metrics/__init__.py b/proteinflow/metrics/__init__.py index fc02841..1d94d95 100644 --- a/proteinflow/metrics/__init__.py +++ b/proteinflow/metrics/__init__.py @@ -2,16 +2,23 @@ import os -import Bio.PDB import biotite.structure.io as bsio import blosum as bl -import esm import numpy as np import torch -from tmtools import tm_align from torch.nn import functional as F from tqdm import tqdm +from proteinflow.extra import requires_extra + +try: + import esm +except ImportError: + pass +try: + from tmtools import tm_align +except ImportError: + pass try: import ablang except ImportError: @@ -78,6 +85,7 @@ def long_repeat_num(seq, thr=5): return count +@requires_extra("esm", install_name="fair-esm") def _get_esm_model(esm_model_name): """Get ESM model, batch converter and tok_to_idx dictionary.""" model_dict = { @@ -96,6 +104,7 @@ def _get_esm_model(esm_model_name): return esm_model, batch_converter, tok_to_idx +@requires_extra("ablang") def ablang_pll( sequence, predict_mask, @@ -149,6 +158,7 @@ def ablang_pll( return pll +@requires_extra("esm", install_name="fair-esm") def esm_pll( chain_sequences, predict_masks, @@ -229,6 +239,7 @@ def ca_rmsd(coordinates1, coordinates2): return np.sqrt(((coordinates1 - coordinates2) ** 2).sum(axis=-1).mean()) +@requires_extra("tmtools") def tm_score(coordinates1, coordinates2, sequence1, sequence2): """Calculate TM-score between two structures. @@ -253,6 +264,9 @@ def tm_score(coordinates1, coordinates2, sequence1, sequence2): return (res.tm_norm_chain1 + res.tm_norm_chain2) / 2 +requires_extra("esm", install_name="fair-esm[esmfold]") + + def esmfold_generate(sequences, filepaths=None): """Generate PDB structures using ESMFold. @@ -286,6 +300,7 @@ def esmfold_generate(sequences, filepaths=None): f.write(output) +@requires_extra("igfold") def igfold_generate(sequence_dicts, filepaths=None, use_openmm=False): """Generate PDB structures using IgFold. @@ -320,6 +335,7 @@ def igfold_generate(sequence_dicts, filepaths=None, use_openmm=False): ) +@requires_extra("ImmuneBuilder") def immunebuilder_generate(sequence_dicts, filepaths=None, protein_type="antibody"): """Generate PDB structures using ImmuneBuilder. diff --git a/proteinflow/visualize.py b/proteinflow/visualize.py index 07083f1..e05c5f5 100644 --- a/proteinflow/visualize.py +++ b/proteinflow/visualize.py @@ -3,9 +3,9 @@ import string import numpy as np -import py3Dmol from proteinflow.data import PDBEntry, ProteinEntry +from proteinflow.extra import _get_view def show_animation_from_pdb( @@ -55,7 +55,7 @@ def show_animation_from_pdb( models += "".join([str(x) for x in atoms]) models += "ENDMDL\n" - view = py3Dmol.view(width=canvas_size[0], height=canvas_size[1]) + view = _get_view(canvas_size) view.addModelsAsFrames(models) for i, at in enumerate(atoms): @@ -116,7 +116,7 @@ def show_animation_from_pickle( models += "".join([str(x) for x in atoms]) models += "ENDMDL\n" - view = py3Dmol.view(width=canvas_size[0], height=canvas_size[1]) + view = _get_view(canvas_size) view.addModelsAsFrames(models) for i, at in enumerate(atoms): diff --git a/pyproject.toml b/pyproject.toml index ac764df..1b1f267 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,17 +36,24 @@ dependencies = [ "pypdb", "prody", "joblib", - "methodtools", - "py3Dmol", - "tmtools", - "fair-esm", - "MDAnalysis", ] keywords = ["bioinformatics", "dataset", "protein", "PDB", "deep learning", "antibody"] [project.scripts] proteinflow = "proteinflow.cli:cli" +[project.optional-dependencies] +processing = [ + "py3Dmol", + "methodtools", + "tmtools", + "fair-esm", + "MDAnalysis", + "ablang", + "igfold", + "immunebuilder", +] + [tool.setuptools.packages] find = {} From 6835a5a69f97b6770d806e354e9e4ff638422d9c Mon Sep 17 00:00:00 2001 From: Liza Kozlova Date: Wed, 15 Nov 2023 15:57:28 +0000 Subject: [PATCH 2/3] fix: some package names + move blosum to optional --- .conda/arm64/meta.yaml | 10 +++++++--- proteinflow/data/__init__.py | 2 +- proteinflow/metrics/__init__.py | 10 ++++++---- pyproject.toml | 3 +-- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.conda/arm64/meta.yaml b/.conda/arm64/meta.yaml index 3ccae7e..ba341a7 100644 --- a/.conda/arm64/meta.yaml +++ b/.conda/arm64/meta.yaml @@ -30,7 +30,7 @@ requirements: - Biopython==1.79 - click>=8.1.3 - biopandas>=0.4.1 - - boto3==1.24.59 + - boto3 - p-tqdm - networkx==2.8.8 - einops @@ -41,8 +41,12 @@ requirements: - awscli==1.25.60 - bs4 - rcsbsearch - - tmtools - - fair-esm + - pyyaml + - rdkit + - pypdb + - prody + + about: home: https://github.com/adaptyvbio/ProteinFlow diff --git a/proteinflow/data/__init__.py b/proteinflow/data/__init__.py index dff49ae..680fa0a 100644 --- a/proteinflow/data/__init__.py +++ b/proteinflow/data/__init__.py @@ -1993,7 +1993,7 @@ def align_structure(self, reference_pdb_path, save_pdb_path, chain_ids=None): io.save(save_pdb_path) @staticmethod - @requires_extra("mda", install_name="MDAnalysis") + @requires_extra("MDAnalysis") def combine_multiple_frames(files, output_path="combined.pdb"): """Combine multiple PDB files into a single multiframe PDB file. diff --git a/proteinflow/metrics/__init__.py b/proteinflow/metrics/__init__.py index 1d94d95..4b83e09 100644 --- a/proteinflow/metrics/__init__.py +++ b/proteinflow/metrics/__init__.py @@ -3,7 +3,6 @@ import os import biotite.structure.io as bsio -import blosum as bl import numpy as np import torch from torch.nn import functional as F @@ -11,6 +10,10 @@ from proteinflow.extra import requires_extra +try: + import blosum as bl +except ImportError: + pass try: import esm except ImportError: @@ -33,6 +36,7 @@ pass +@requires_extra("blosum") def blosum62_score(seq_before, seq_after): """Calculate the BLOSUM62 score between two sequences. @@ -264,9 +268,7 @@ def tm_score(coordinates1, coordinates2, sequence1, sequence2): return (res.tm_norm_chain1 + res.tm_norm_chain2) / 2 -requires_extra("esm", install_name="fair-esm[esmfold]") - - +@requires_extra("esm", install_name="fair-esm[esmfold]") def esmfold_generate(sequences, filepaths=None): """Generate PDB structures using ESMFold. diff --git a/pyproject.toml b/pyproject.toml index 1b1f267..dbf15c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,11 +30,9 @@ dependencies = [ "bs4>=0.0.1", "pyyaml>=6", "rcsbsearch", - "blosum>=2.0", "pre-commit", "rdkit", "pypdb", - "prody", "joblib", ] keywords = ["bioinformatics", "dataset", "protein", "PDB", "deep learning", "antibody"] @@ -52,6 +50,7 @@ processing = [ "ablang", "igfold", "immunebuilder", + "blosum>=2.0", ] [tool.setuptools.packages] From e86ed1c4c2a529247a822d7c9ad3229bd745e5ce Mon Sep 17 00:00:00 2001 From: Liza Kozlova Date: Wed, 15 Nov 2023 16:29:08 +0000 Subject: [PATCH 3/3] build: update conda dependencies --- .conda/arm64/meta.yaml | 6 +-- .conda/default/meta.yaml | 14 ++++--- channeldata.json | 39 +++++++++++++++++ index.html | 90 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+), 10 deletions(-) create mode 100644 channeldata.json create mode 100644 index.html diff --git a/.conda/arm64/meta.yaml b/.conda/arm64/meta.yaml index ba341a7..500f569 100644 --- a/.conda/arm64/meta.yaml +++ b/.conda/arm64/meta.yaml @@ -25,7 +25,7 @@ requirements: - wheel - setuptools run: - - numpy>=1.17 + - numpy>=1.21,<1.25 - editdistance>=0.6.0 - Biopython==1.79 - click>=8.1.3 @@ -44,9 +44,7 @@ requirements: - pyyaml - rdkit - pypdb - - prody - - + - joblib about: home: https://github.com/adaptyvbio/ProteinFlow diff --git a/.conda/default/meta.yaml b/.conda/default/meta.yaml index ec132c5..cebb308 100644 --- a/.conda/default/meta.yaml +++ b/.conda/default/meta.yaml @@ -25,26 +25,28 @@ requirements: - wheel - setuptools run: - - numpy>=1.17 + - numpy>=1.21,<1.25 - editdistance>=0.6.0 - Biopython==1.79 - click>=8.1.3 - biopandas>=0.4.1 - - boto3==1.24.59 + - boto3= - p-tqdm - networkx==2.8.8 - einops - pandas - pytorch>=1.10.0 - biotite==0.35.0 - - aiobotocore==2.4.2 - - awscli==1.25.60 + - aiobotocore + - awscli - bs4 - rcsbsearch - - tmtools - - fair-esm - mmseqs2 - foldseek + - pyyaml + - rdkit + - pypdb + - joblib about: home: https://github.com/adaptyvbio/ProteinFlow diff --git a/channeldata.json b/channeldata.json new file mode 100644 index 0000000..d4798e0 --- /dev/null +++ b/channeldata.json @@ -0,0 +1,39 @@ +{ + "channeldata_version": 1, + "packages": { + "proteinflow": { + "activate.d": false, + "binary_prefix": false, + "deactivate.d": false, + "description": null, + "dev_url": "https://github.com/adaptyvbio/ProteinFlow", + "doc_source_url": null, + "doc_url": "https://adaptyvbio.github.io/ProteinFlow/", + "home": "https://github.com/adaptyvbio/ProteinFlow", + "icon_hash": null, + "icon_url": null, + "identifiers": null, + "keywords": null, + "license": "BSD-3-Clause", + "post_link": false, + "pre_link": false, + "pre_unlink": false, + "recipe_origin": null, + "run_exports": {}, + "source_git_url": "https://github.com/adaptyvbio/ProteinFlow.git", + "source_url": null, + "subdirs": [ + "linux-64" + ], + "summary": "Versatile pipeline for processing protein structure data for deep learning applications.", + "tags": null, + "text_prefix": true, + "timestamp": 1700064405, + "version": "2.4.1" + } + }, + "subdirs": [ + "linux-64", + "noarch" + ] +} diff --git a/index.html b/index.html new file mode 100644 index 0000000..3e4df35 --- /dev/null +++ b/index.html @@ -0,0 +1,90 @@ + + + proteinflow + + + +

proteinflow

+

RSS Feed   channeldata.json

+linux-64   noarch    + + + + + + + + + + + + + + + +
PackageLatest VersionDocDevLicenselinux-64noarch Summary
proteinflow2.4.1docdevBSD-3-ClauseX Versatile pipeline for processing protein structure data for deep...
+
Updated: 2023-11-15 16:08:06 +0000 - Files: 1
+ + \ No newline at end of file