diff --git a/proteinflow/__init__.py b/proteinflow/__init__.py index 1a6996f..c0fa988 100644 --- a/proteinflow/__init__.py +++ b/proteinflow/__init__.py @@ -243,6 +243,7 @@ def generate_data( sabdab_data_path=None, require_antigen=False, exclude_chains=None, + exclude_chains_file=None, exclude_threshold=0.7, exclude_clusters=False, exclude_based_on_cdr=None, @@ -326,6 +327,8 @@ def generate_data( if `True`, only use SAbDab files with an antigen exclude_chains : list of str, optional a list of chains (`{pdb_id}-{chain_id}`) to exclude from the splitting (e.g. `["1A2B-A", "1A2B-B"]`); chain id is the author chain id + exclude_chains_file : str, optional + path to a file containing the sequences to exclude, one sequence per line exclude_threshold : float in [0, 1], default 0.7 the sequence similarity threshold for excluding chains exclude_clusters : bool, default False @@ -421,6 +424,7 @@ def generate_data( ignore_existing=True, min_seq_id=min_seq_id, exclude_chains=exclude_chains, + exclude_chains_file=exclude_chains_file, exclude_threshold=exclude_threshold, exclude_clusters=exclude_clusters, exclude_based_on_cdr=exclude_based_on_cdr, diff --git a/proteinflow/cli.py b/proteinflow/cli.py index ded5ec0..f01cfe8 100644 --- a/proteinflow/cli.py +++ b/proteinflow/cli.py @@ -177,6 +177,34 @@ def download(**kwargs): is_flag=True, help="Whether or not to load ligands found in the pdbs example: data['A']['ligand'][0]['X']", ) +@click.option( + "--exclude_chains", + "-e", + multiple=True, + type=str, + help="Exclude specific chains from the dataset ({pdb_id}-{chain_id}, e.g. -e 1a2b-A)", +) +@click.option( + "--exclude_chains_file", + type=str, + help="Exclude specific chains from the dataset (path to a file containing the sequences to exclude, one sequence per line)", +) +@click.option( + "--exclude_threshold", + default=0.7, + type=float, + help="Exclude chains with sequence identity to exclude_chains above this threshold", +) +@click.option( + "--exclude_clusters", + is_flag=True, + help="Exclude clusters that contain chains similar to chains to exclude", +) +@click.option( + "--exclude_based_on_cdr", + type=click.Choice(["L1", "L2", "L3", "H1", "H2", "H3"]), + help="if given and exclude_clusters is true + the dataset is SAbDab, exclude files based on only the given CDR clusters", +) @click.option( "--exclude_chains_without_ligands", is_flag=True,