From 6d7f543dda3b6e4f05145fa48daecc21df7c1c52 Mon Sep 17 00:00:00 2001
From: jwohlwend <jeremy.wohlwend@gmail.com>
Date: Wed, 20 Nov 2024 22:44:47 -0500
Subject: [PATCH] clarify docs

---
 docs/prediction.md | 5 +++--
 src/boltz/main.py  | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/docs/prediction.md b/docs/prediction.md
index 4cc185a..9527dc3 100644
--- a/docs/prediction.md
+++ b/docs/prediction.md
@@ -47,7 +47,7 @@ constraints:
         binder: CHAIN_ID
         contacts: [[CHAIN_ID, RES_IDX], [CHAIN_ID, RES_IDX]]
 ```
-`sequences` has one entry for every unique chain/molecule in the input. Each polymer entity as a `ENTITY_TYPE`  either `protein`, `dna` or`rna` and have a `sequence` attribute. Non-polymer entities are indicated by `ENTITY_TYPE` equal to `ligand` and have a `smiles` or `ccd` attribute. `CHAIN_ID` is the unique identifier for each chain/molecule, and it should be set as a list in case of multiple identical entities in the structure. For proteins, the `msa` key is optional. If unset, MSA's will be automatically generated using the mmseqs2 server. If you wish to use a precomputed MSA, use the `msa` attribute with `MSA_PATH` indicating the path to the `.a3m` file containing the MSA for that protein. If you wish to explicitly run single sequence mode (which is generally advised against as it will hurt model performance), you may do so by using the special keyword `empty` for that protein (ex: `msa: empty`).
+`sequences` has one entry for every unique chain/molecule in the input. Each polymer entity has an `ENTITY_TYPE` of either `protein`, `dna` or `rna` and has a `sequence` attribute. Non-polymer entities are indicated by `ENTITY_TYPE` equal to `ligand` and have a `smiles` or `ccd` attribute. `CHAIN_ID` is the unique identifier for each chain/molecule, and it should be set as a list in case of multiple identical entities in the structure. For proteins, the `msa` key is required by default but can be omitted by passing the `--use_msa_server` flag, which will auto-generate the MSA using the mmseqs2 server. If you wish to use a precomputed MSA, use the `msa` attribute with `MSA_PATH` indicating the path to the `.a3m` file containing the MSA for that protein. If you wish to explicitly run single sequence mode (which is generally advised against as it will hurt model performance), you may do so by using the special keyword `empty` for that protein (ex: `msa: empty`).
 
 The `modifications` field is an optional field that allows you to specify modified residues in the polymer (`protein`, `dna` or`rna`). The `position` field specifies the index (starting from 1) of the residue, and `ccd` is the CCD code of the modified residue. This field is currently only supported for CCD ligands.
 
@@ -80,7 +80,7 @@ The fasta format is a little simpler, and should contain entries as follows:
 SEQUENCE
 ```
 
-The `CHAIN_ID` is a unique identifier for each input chain. The `ENTITY_TYPE` can be one of `protein`, `dna`, `rna`, `smiles`, `ccd` (note that we support both smiles and CCD code for ligands). The `MSA_PATH` is optional, and only applicable to proteins. By default, MSA's are auto-generated using the mmseqs2 server. If you wish to use a custom MSA, use it toset path to the `.a3m` file containing a pre-computed MSA for this protein. If you wish to explicitly run single sequence mode (which is generally advised against as it will hurt model performance), you may do so by using the special keyword `empty` for that protein (ex: `>A|protein|empty`).
+The `CHAIN_ID` is a unique identifier for each input chain. The `ENTITY_TYPE` can be one of `protein`, `dna`, `rna`, `smiles`, `ccd` (note that we support both smiles and CCD code for ligands). The `MSA_PATH` is only applicable to proteins. By default, MSAs are required, but they can be omitted by passing the `--use_msa_server` flag, which will auto-generate the MSA using the mmseqs2 server. If you wish to use a custom MSA, set `MSA_PATH` to the path of the `.a3m` file containing a pre-computed MSA for this protein. If you wish to explicitly run single sequence mode (which is generally advised against as it will hurt model performance), you may do so by using the special keyword `empty` for that protein (ex: `>A|protein|empty`).
 
 For each of these cases, the corresponding `SEQUENCE` will contain an amino acid sequence (e.g. `EFKEAFSLF`), a sequence of nucleotide bases (e.g. `ATCG`), a smiles string (e.g. `CC1=CC=CC=C1`), or a CCD code (e.g. `ATP`), depending on the entity.
 
@@ -121,6 +121,7 @@ The following options are available for the `predict` command:
 | `--output_format`           | `[pdb,mmcif]`   | `mmcif`            | The output format to use for the predictions.                                   |
 | `--num_workers INTEGER`     | `INTEGER`       | `2`                | The number of dataloader workers to use for prediction.                         |
 | `--override`                | `FLAG`          | `False`            | Whether to override existing predictions if found.                              |
+| `--use_msa_server`          | `FLAG`          | `False`            | Whether to use the MSA server to generate MSAs.                                 |
 
 ## Output
 
diff --git a/src/boltz/main.py b/src/boltz/main.py
index a4daa2b..cb2a14c 100644
--- a/src/boltz/main.py
+++ b/src/boltz/main.py
@@ -260,8 +260,9 @@ def process_inputs(  # noqa: C901, PLR0912, PLR0915
             # Add to generate list, assigning entity id
             if (chain.mol_type == prot_id) and (chain.msa_id == 0):
                 entity_id = chain.entity_id
-                to_generate[entity_id] = target.sequences[entity_id]
-                chain.msa_id = msa_dir / f"{target_id}_{entity_id}.a3m"
+                msa_id = f"{target_id}_{entity_id}"
+                to_generate[msa_id] = target.sequences[entity_id]
+                chain.msa_id = msa_dir / f"{msa_id}.a3m"
 
             # We do not support msa generation for non-protein chains
             elif chain.msa_id == 0: