
Commit

v1.2.0
- fixed contact filter for peptides, which was causing very short peptide binders to be rejected
- avoid saving duplicate sequences during MPNN redesign; this is very unlikely but can happen when designing very short peptides, where multiple trajectories converge on the same sequence
- added extensive checks to the installation script to make sure each step completes fully before proceeding
- removed default anaconda channel dependency
- added libgfortran5 to installation requirements
- added live trajectory and accepted design counters to the colab notebook
- fixed hydrophobicity calculation for the binder surface; previously the surface taken into account was that of the whole complex instead of the binder alone
- colab target settings are now saved in the design output folder on Google Drive and can be reloaded to continue the design campaign
- added options in the settings_advanced JSONs to manually set the AF2 params directory, DSSP path, or DAlphaBall path; if left empty, the default installation paths are used
- added more relaxed filter settings for normal proteins and peptides
- added more advanced settings files allowing interface redesign with MPNN, as well as increased target flexibility by masking the template sequence during design and reprediction
- fixed MPNN sequence generation where the batch size did not correspond to the number of generated sequences
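The de-duplication and amino-acid-restriction logic described in the notes above can be sketched as a standalone function (a minimal sketch with hypothetical names; the actual implementation lives in bindcraft.py):

```python
def filter_mpnn_sequences(candidates, existing, omit_aas=""):
    """Drop duplicate sequences and sequences containing restricted
    amino acids, then sort survivors by ascending MPNN score."""
    restricted = {aa.strip().upper() for aa in omit_aas.split(',') if aa.strip()}
    unique = {}
    for cand in candidates:
        seq = cand['seq'].upper()
        if seq in existing or seq in unique:
            continue  # duplicate across trajectories or within this batch
        if restricted and any(aa in seq for aa in restricted):
            continue  # contains an excluded amino acid
        unique[seq] = cand
    return sorted(unique.values(), key=lambda c: c['score'])

candidates = [
    {'seq': 'MKLV', 'score': 1.2},
    {'seq': 'MKLV', 'score': 1.5},  # duplicate, dropped
    {'seq': 'MCLV', 'score': 0.9},  # contains omitted 'C', dropped
    {'seq': 'MALV', 'score': 0.7},
]
print(filter_mpnn_sequences(candidates, existing=set(), omit_aas='C'))
```

Keying the dict by sequence keeps only the first occurrence of each sequence; sorting afterwards orders survivors by MPNN score, lowest (best) first.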
martinpacesa authored Nov 6, 2024
1 parent b5e1bc1 commit 1163713
Showing 25 changed files with 5,199 additions and 36,177 deletions.
8 changes: 5 additions & 3 deletions README.md
@@ -12,6 +12,8 @@ First you need to clone this repository. Replace **[install_folder]** with the p

Then navigate into your install folder using *cd* and run the installation code. BindCraft requires a CUDA-compatible Nvidia graphics card to run. In the *cuda* setting, please specify the CUDA version compatible with your graphics card, for example '11.8'. If unsure, leave it blank, but the installation might then select the wrong version, which will lead to errors. In *pkg_manager* specify whether you are using 'mamba' or 'conda'; if left blank, 'conda' is used by default.

Note: This install script will install PyRosetta, which requires a license for commercial purposes.

`bash install_bindcraft.sh --cuda '12.4' --pkg_manager 'conda'`

## Google Colab
@@ -39,16 +41,16 @@ number_of_final_designs -> how many designs that pass all filters to aim for,
```
Then run the binder design script:

`sbatch ./bindcraft.slurm --settings './settings_target/PDL1.json' --filters './settings_filters/default_filters.json' --advanced './settings_advanced/4stage_multimer.json'`
`sbatch ./bindcraft.slurm --settings './settings_target/PDL1.json' --filters './settings_filters/default_filters.json' --advanced './settings_advanced/default_4stage_multimer.json'`

The *settings* flag should point to your target .json which you set above. The *filters* flag points to the json where the design filters are specified (default is ./filters/default_filters.json). The *advanced* flag points to your advanced settings (default is ./advanced_settings/4stage_multimer.json). If you leave out the filters and advanced settings flags it will automatically point to the defaults.
The *settings* flag should point to the target .json you set above. The *filters* flag points to the json where the design filters are specified (default is ./settings_filters/default_filters.json). The *advanced* flag points to your advanced settings (default is ./settings_advanced/default_4stage_multimer.json). If you leave out the filters and advanced settings flags, they will automatically point to the defaults.

Alternatively, if your machine does not support SLURM, you can run the code directly by activating the environment in conda and running the python code:

```
conda activate BindCraft
cd /path/to/bindcraft/folder/
python -u ./bindcraft.py --settings './settings_target/PDL1.json' --filters './settings_filters/default_filters.json' --advanced './settings_advanced/4stage_multimer.json'
python -u ./bindcraft.py --settings './settings_target/PDL1.json' --filters './settings_filters/default_filters.json' --advanced './settings_advanced/default_4stage_multimer.json'
```

**We recommend generating at least 100 final designs passing all filters, then ordering the top 5-20 for experimental characterisation.** If high-affinity binders are required, it is better to screen more, as the ipTM metric used for ranking is not a good predictor of affinity, but has been shown to be a good binary predictor of binding.
102 changes: 53 additions & 49 deletions bindcraft.py
@@ -15,7 +15,7 @@
help='Path to the basic settings.json file. Required.')
parser.add_argument('--filters', '-f', type=str, default='./settings_filters/default_filters.json',
help='Path to the filters.json file used to filter design. If not provided, default will be used.')
parser.add_argument('--advanced', '-a', type=str, default='./settings_advanced/4stage_multimer.json',
parser.add_argument('--advanced', '-a', type=str, default='./settings_advanced/default_4stage_multimer.json',
help='Path to the advanced.json file with additional design settings. If not provided, default will be used.')

args = parser.parse_args()
@@ -33,11 +33,9 @@
### load AF2 model settings
design_models, prediction_models, multimer_validation = load_af2_models(advanced_settings["use_multimer_design"])

### set package settings
### perform checks on advanced_settings
bindcraft_folder = os.path.dirname(os.path.realpath(__file__))
advanced_settings["af_params_dir"] = bindcraft_folder
advanced_settings["dssp_path"] = os.path.join(bindcraft_folder, 'functions/dssp')
advanced_settings["dalphaball_path"] = os.path.join(bindcraft_folder, 'functions/DAlphaBall.gcc')
advanced_settings = perform_advanced_settings_check(advanced_settings, bindcraft_folder)

### generate directories, design path names can be found within the function
design_paths = generate_directories(target_settings["design_path"])
@@ -66,7 +64,6 @@
script_start_time = time.time()
trajectory_n = 1
accepted_designs = 0
rejected_designs = 0

### start design loop
while True:
@@ -170,44 +167,47 @@

### MPNN redesign of starting binder
mpnn_trajectories = mpnn_gen_sequence(trajectory_pdb, binder_chain, trajectory_interface_residues, advanced_settings)
existing_mpnn_sequences = set(pd.read_csv(mpnn_csv, usecols=['Sequence'])['Sequence'].values)

# create set of MPNN sequences with allowed amino acid composition
restricted_AAs = set(aa.strip().upper() for aa in advanced_settings["omit_AAs"].split(',')) if advanced_settings["force_reject_AA"] else set()

mpnn_sequences = sorted({
mpnn_trajectories['seq'][n][-length:]: {
'seq': mpnn_trajectories['seq'][n][-length:],
'score': mpnn_trajectories['score'][n],
'seqid': mpnn_trajectories['seqid'][n]
} for n in range(advanced_settings["num_seqs"])
if (not restricted_AAs or not any(aa in mpnn_trajectories['seq'][n][-length:].upper() for aa in restricted_AAs))
and mpnn_trajectories['seq'][n][-length:] not in existing_mpnn_sequences
}.values(), key=lambda x: x['score'])

del existing_mpnn_sequences

# check whether any sequences are left after amino acid rejection and duplication check, and if yes proceed with prediction
if mpnn_sequences:
# add optimisation for increasing recycles if trajectory is beta sheeted
if advanced_settings["optimise_beta"] and float(trajectory_beta) > 15:
advanced_settings["num_recycles_validation"] = advanced_settings["optimise_beta_recycles_valid"]

### Compile prediction models once for faster prediction of MPNN sequences
clear_mem()
# compile complex prediction model
complex_prediction_model = mk_afdesign_model(protocol="binder", num_recycles=advanced_settings["num_recycles_validation"], data_dir=advanced_settings["af_params_dir"],
use_multimer=multimer_validation)
complex_prediction_model.prep_inputs(pdb_filename=target_settings["starting_pdb"], chain=target_settings["chains"], binder_len=length, rm_target_seq=advanced_settings["rm_template_seq_predict"],
rm_target_sc=advanced_settings["rm_template_sc_predict"])

# compile binder monomer prediction model
binder_prediction_model = mk_afdesign_model(protocol="hallucination", use_templates=False, initial_guess=False,
use_initial_atom_pos=False, num_recycles=advanced_settings["num_recycles_validation"],
data_dir=advanced_settings["af_params_dir"], use_multimer=multimer_validation)
binder_prediction_model.prep_inputs(length=length)

# iterate over designed sequences
for mpnn_sequence in mpnn_sequences:
mpnn_time = time.time()

# whether to hard reject sequences with excluded amino acids
if advanced_settings["force_reject_AA"]:
restricted_AAs = set(advanced_settings["omit_AAs"].split(','))
mpnn_sequences = [{'seq': mpnn_trajectories['seq'][n][-length:], 'score': mpnn_trajectories['score'][n], 'seqid': mpnn_trajectories['seqid'][n]}
for n in range(advanced_settings["num_seqs"])
if not any(restricted_AA in mpnn_trajectories['seq'][n] for restricted_AA in restricted_AAs)]
else:
mpnn_sequences = [{'seq': mpnn_trajectories['seq'][n][-length:], 'score': mpnn_trajectories['score'][n], 'seqid': mpnn_trajectories['seqid'][n]}
for n in range(advanced_settings["num_seqs"])]

# sort MPNN sequences by lowest MPNN score
mpnn_sequences.sort(key=lambda x: x['score'])

# add optimisation for increasing recycles if trajectory is beta sheeted
if advanced_settings["optimise_beta"] and float(trajectory_beta) > 15:
advanced_settings["num_recycles_validation"] = advanced_settings["optimise_beta_recycles_valid"]

### Compile prediction models once for faster prediction of MPNN sequences
clear_mem()
# compile complex prediction model
complex_prediction_model = mk_afdesign_model(protocol="binder", num_recycles=advanced_settings["num_recycles_validation"], data_dir=advanced_settings["af_params_dir"],
use_multimer=multimer_validation)
complex_prediction_model.prep_inputs(pdb_filename=target_settings["starting_pdb"], chain=target_settings["chains"], binder_len=length, rm_target_seq=advanced_settings["rm_template_seq_predict"],
rm_target_sc=advanced_settings["rm_template_sc_predict"])

# compile binder monomer prediction model
binder_prediction_model = mk_afdesign_model(protocol="hallucination", use_templates=False, initial_guess=False,
use_initial_atom_pos=False, num_recycles=advanced_settings["num_recycles_validation"],
data_dir=advanced_settings["af_params_dir"], use_multimer=multimer_validation)
binder_prediction_model.prep_inputs(length=length)

# iterate over designed sequences
for mpnn_sequence in mpnn_sequences:
mpnn_time = time.time()

# compile sequences dictionary with scores and remove duplicate sequences
if mpnn_sequence['seq'] not in [v['seq'] for v in mpnn_dict.values()]:
# generate mpnn design name numbering
mpnn_design_name = design_name + "_mpnn" + str(mpnn_n)
mpnn_score = round(mpnn_sequence['score'],2)
@@ -230,6 +230,7 @@
# if AF2 filters are not passed then skip the scoring
if not pass_af2_filters:
print(f"Base AF2 filters not passed for {mpnn_design_name}, skipping interface scoring")
mpnn_n += 1
continue

# calculate statistics for each model individually
@@ -415,14 +416,17 @@
# if enough mpnn sequences of the same trajectory pass filters then stop
if accepted_mpnn >= advanced_settings["max_mpnn_sequences"]:
break

if accepted_mpnn >= 1:
print("Found "+str(accepted_mpnn)+" MPNN designs passing filters")
print("")
else:
print("Skipping duplicate sequence")
print("No accepted MPNN designs found for this trajectory.")
print("")

if accepted_mpnn >= 1:
print("Found "+str(accepted_mpnn)+" MPNN designs passing filters")
else:
print("No accepted MPNN designs found for this trajectory.")
rejected_designs += 1
print('Duplicate MPNN designs sampled with different trajectory, skipping current trajectory optimisation')
print("")

# save space by removing unrelaxed design trajectory PDB
if advanced_settings["remove_unrelaxed_trajectory"]:
@@ -447,4 +451,4 @@
### Script finished
elapsed_time = time.time() - script_start_time
elapsed_text = f"{'%d hours, %d minutes, %d seconds' % (int(elapsed_time // 3600), int((elapsed_time % 3600) // 60), int(elapsed_time % 60))}"
print("Finished all designs. Script execution for "+str(trajectory_n)+" trajectories took: "+elapsed_text)
print("Finished all designs. Script execution for "+str(trajectory_n)+" trajectories took: "+elapsed_text)
4 changes: 3 additions & 1 deletion bindcraft.slurm
@@ -10,7 +10,9 @@
#SBATCH --output=bindcraft_%A.log

# Initialise environment and modules
conda activate BindCraft
CONDA_BASE=$(conda info --base)
source ${CONDA_BASE}/bin/activate ${CONDA_BASE}/envs/BindCraft
export LD_LIBRARY_PATH=${CONDA_BASE}/lib

# alternatively you can source the environment directly
#source /path/to/mambaforge/bin/activate /path/to/mambaforge/envs/BindCraft
2 changes: 1 addition & 1 deletion functions/biopython_utils.py
@@ -233,4 +233,4 @@ def calculate_percentages(total, helix, sheet):
sheet_percentage = round((sheet / total) * 100,2) if total > 0 else 0
loop_percentage = round(((total - helix - sheet) / total) * 100,2) if total > 0 else 0

return helix_percentage, sheet_percentage, loop_percentage
return helix_percentage, sheet_percentage, loop_percentage
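The patched helper shown truncated above reduces to simple percentage arithmetic; a self-contained sketch for reference:

```python
def calculate_percentages(total, helix, sheet):
    # guard against division by zero when no residues were assigned
    helix_percentage = round((helix / total) * 100, 2) if total > 0 else 0
    sheet_percentage = round((sheet / total) * 100, 2) if total > 0 else 0
    loop_percentage = round(((total - helix - sheet) / total) * 100, 2) if total > 0 else 0
    return helix_percentage, sheet_percentage, loop_percentage

print(calculate_percentages(50, 20, 10))  # → (40.0, 20.0, 40.0)
```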
4 changes: 2 additions & 2 deletions functions/colabdesign_utils.py
@@ -357,7 +357,7 @@ def mpnn_gen_sequence(trajectory_pdb, binder_chain, trajectory_interface_residue
mpnn_model.prep_inputs(pdb_filename=trajectory_pdb, chain=design_chains, fix_pos=fixed_positions, rm_aa=advanced_settings["omit_AAs"])

# sample MPNN sequences in parallel
mpnn_sequences = mpnn_model.sample_parallel(temperature=advanced_settings["sampling_temp"], num=advanced_settings["num_seqs"], batch=advanced_settings["sample_seq_parallel"])
mpnn_sequences = mpnn_model.sample(temperature=advanced_settings["sampling_temp"], num=advanced_settings["num_seqs"], batch=advanced_settings["num_seqs"])

return mpnn_sequences

@@ -474,4 +474,4 @@ def plot_trajectory(af_model, design_name, design_paths):
plt.savefig(os.path.join(design_paths["Trajectory/Plots"], design_name+"_"+metric+".png"), dpi=150)

# Close the figure
plt.close()
plt.close()
27 changes: 26 additions & 1 deletion functions/generic_utils.py
@@ -225,10 +225,35 @@ def perform_input_check(args):

# Set a random advanced json settings file if not provided
if not args.advanced:
args.advanced = os.path.join(binder_script_path, 'settings_advanced', '4stage_multimer.json')
args.advanced = os.path.join(binder_script_path, 'settings_advanced', 'default_4stage_multimer.json')

return args.settings, args.filters, args.advanced

# check specific advanced settings
def perform_advanced_settings_check(advanced_settings, bindcraft_folder):
# set paths to model weights and executables
if bindcraft_folder == "colab":
advanced_settings["af_params_dir"] = '/content/bindcraft/params/'
advanced_settings["dssp_path"] = '/content/bindcraft/functions/dssp'
advanced_settings["dalphaball_path"] = '/content/bindcraft/functions/DAlphaBall.gcc'
else:
# Set paths individually if they are not already set
if not advanced_settings["af_params_dir"]:
advanced_settings["af_params_dir"] = bindcraft_folder
if not advanced_settings["dssp_path"]:
advanced_settings["dssp_path"] = os.path.join(bindcraft_folder, 'functions', 'dssp')
if not advanced_settings["dalphaball_path"]:
advanced_settings["dalphaball_path"] = os.path.join(bindcraft_folder, 'functions', 'DAlphaBall.gcc')

# check formatting of omit_AAs setting
omit_aas = advanced_settings["omit_AAs"]
if advanced_settings["omit_AAs"] in [None, False, '']:
advanced_settings["omit_AAs"] = None
elif isinstance(advanced_settings["omit_AAs"], str):
advanced_settings["omit_AAs"] = advanced_settings["omit_AAs"].strip()

return advanced_settings
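The fallback behaviour above can be exercised in isolation: empty path entries are filled from the install folder, while user-set paths are kept. A small usage sketch mirroring the non-colab branch (the concrete paths in the example are hypothetical):

```python
import os

def resolve_paths(advanced_settings, bindcraft_folder):
    # fill each path only if the user left it empty
    if not advanced_settings.get("af_params_dir"):
        advanced_settings["af_params_dir"] = bindcraft_folder
    if not advanced_settings.get("dssp_path"):
        advanced_settings["dssp_path"] = os.path.join(bindcraft_folder, 'functions', 'dssp')
    if not advanced_settings.get("dalphaball_path"):
        advanced_settings["dalphaball_path"] = os.path.join(bindcraft_folder, 'functions', 'DAlphaBall.gcc')
    return advanced_settings

settings = {"af_params_dir": "", "dssp_path": "/opt/dssp", "dalphaball_path": ""}
print(resolve_paths(settings, "/home/user/bindcraft"))
```

Note that a manually set path ("/opt/dssp" here) is preserved, while the empty entries fall back to install-folder defaults.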

# Load settings from JSONs
def load_json_settings(settings_json, filters_json, advanced_json):
# load settings from json files
6 changes: 4 additions & 2 deletions functions/pyrosetta_utils.py
@@ -99,17 +99,19 @@ def score_interface(pdb_file, binder_chain="B"):
interface_binder_fraction = 0

# calculate surface hydrophobicity
binder_pose = {pose.pdb_info().chain(pose.conformation().chain_begin(i)): p for i, p in zip(range(1, pose.num_chains()+1), pose.split_by_chain())}[binder_chain]

layer_sel = pr.rosetta.core.select.residue_selector.LayerSelector()
layer_sel.set_layers(pick_core = False, pick_boundary = False, pick_surface = True)
surface_res = layer_sel.apply(pose)
surface_res = layer_sel.apply(binder_pose)

exp_apol_count = 0
total_count = 0

# count apolar and aromatic residues at the surface
for i in range(1, len(surface_res) + 1):
if surface_res[i] == True:
res = pose.residue(i)
res = binder_pose.residue(i)

# count apolar and aromatic residues as hydrophobic
if res.is_apolar() == True or res.name() == 'PHE' or res.name() == 'TRP' or res.name() == 'TYR':
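Stripped of the PyRosetta objects, the corrected counting logic — selecting surface residues from the binder chain alone rather than the whole complex — amounts to the following toy sketch (inputs are hypothetical (name, is_apolar, on_surface) tuples, not real pose objects):

```python
HYDROPHOBIC_AROMATIC = {'PHE', 'TRP', 'TYR'}

def surface_hydrophobicity(binder_residues):
    """binder_residues: (name, is_apolar, on_surface) tuples for the
    binder chain ONLY -- the bug was computing this over the complex."""
    exposed = [(name, apolar) for name, apolar, surf in binder_residues if surf]
    if not exposed:
        return 0.0
    # apolar and aromatic residues both count as hydrophobic
    hydrophobic = sum(1 for name, apolar in exposed
                      if apolar or name in HYDROPHOBIC_AROMATIC)
    return round(hydrophobic / len(exposed) * 100, 2)

binder = [('ALA', True, True), ('TYR', False, True),
          ('ASP', False, True), ('LEU', True, False)]
print(surface_hydrophobicity(binder))  # → 66.67
```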
