Skip to content

Commit

Permalink
refactor: add more splitting options to generate function
Browse files: browse the repository at this point in the history
  • Loading branch information
elkoz committed Sep 14, 2023
1 parent eb5e08d commit 58c1393
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 0 deletions.
4 changes: 4 additions & 0 deletions proteinflow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ def generate_data(
sabdab_data_path=None,
require_antigen=False,
exclude_chains=None,
exclude_chains_file=None,
exclude_threshold=0.7,
exclude_clusters=False,
exclude_based_on_cdr=None,
Expand Down Expand Up @@ -326,6 +327,8 @@ def generate_data(
if `True`, only use SAbDab files with an antigen
exclude_chains : list of str, optional
a list of chains (`{pdb_id}-{chain_id}`) to exclude from the splitting (e.g. `["1A2B-A", "1A2B-B"]`); chain id is the author chain id
exclude_chains_file : str, optional
path to a file containing the sequences to exclude, one sequence per line
exclude_threshold : float in [0, 1], default 0.7
the sequence similarity threshold for excluding chains
exclude_clusters : bool, default False
Expand Down Expand Up @@ -421,6 +424,7 @@ def generate_data(
ignore_existing=True,
min_seq_id=min_seq_id,
exclude_chains=exclude_chains,
exclude_chains_file=exclude_chains_file,
exclude_threshold=exclude_threshold,
exclude_clusters=exclude_clusters,
exclude_based_on_cdr=exclude_based_on_cdr,
Expand Down
28 changes: 28 additions & 0 deletions proteinflow/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,34 @@ def download(**kwargs):
is_flag=True,
help="Whether or not to load ligands found in the pdbs example: data['A']['ligand'][0]['X']",
)
@click.option(
"--exclude_chains",
"-e",
multiple=True,
type=str,
help="Exclude specific chains from the dataset ({pdb_id}-{chain_id}, e.g. -e 1a2b-A)",
)
@click.option(
"--exclude_chains_file",
type=str,
help="Exclude specific chains from the dataset (path to a file containing the sequences to exclude, one sequence per line)",
)
@click.option(
"--exclude_threshold",
default=0.7,
type=float,
help="Exclude chains with sequence identity to exclude_chains above this threshold",
)
@click.option(
"--exclude_clusters",
is_flag=True,
help="Exclude clusters that contain chains similar to chains to exclude",
)
@click.option(
"--exclude_based_on_cdr",
type=click.Choice(["L1", "L2", "L3", "H1", "H2", "H3"]),
help="if given and exclude_clusters is true + the dataset is SAbDab, exclude files based on only the given CDR clusters",
)
@click.option(
"--exclude_chains_without_ligands",
is_flag=True,
Expand Down

0 comments on commit 58c1393

Please sign in to comment.