Merge pull request #21 from pinellolab/dev

v1.0
pinellolab · Mar 31, 2024 · 1b46538 · 1b46538
2 parents 8dce316 + f3a6932
commit 1b46538
Show file tree

Hide file tree

Showing 188 changed files with 8,641 additions and 1,935 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -23,11 +23,11 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v3
       with:
-        python-version: '3.8'
+        python-version: '3.x'
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install torch==1.12.1+cpu torchvision==0.13.1+cpu torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install torch torchvision torchaudio
         pip install -r requirements.txt
         pip install -e .
     - name: Test with pytest

diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
@@ -0,0 +1,41 @@
+name: "Sphinx: Render docs"
+
+on: push
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.x'
+
+    - name: Install Sphinx & Dependencies
+      run: |
+        pip install sphinx sphinx_markdown_builder sphinx_rtd_theme sphinx-argparse m2r pandas bio
+        sudo apt-get install python3-distutils
+    - name: Build Documentation
+      working-directory: docs
+      run: sphinx-build . _build
+    - name: copy image files
+      run: cp -r docs/assets docs/_build/
+    - uses: actions/upload-pages-artifact@v3
+      with:
+        name: github-pages
+        path: docs/_build/
+
+  deploy:
+    needs: build
+    permissions:
+      id-token: write
+      pages: write
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
 *.pyc
+*.fastq
diff --git a/LICENSE b/LICENSE
diff --git a/MANIFEST.in b/MANIFEST.in
diff --git a/README.md b/README.md
diff --git a/bean/__init__.py b/bean/__init__.py
diff --git a/bean/annotate/__init__.py b/bean/annotate/__init__.py
diff --git a/bean/annotate/_supporting_fn.py b/bean/annotate/_supporting_fn.py
@@ -1,5 +1,5 @@
 from copy import deepcopy
-from typing import List, Tuple
+from typing import List, Tuple, Union
 from tqdm.auto import tqdm
 from ..framework.Edit import Edit, Allele
 from ..framework.AminoAcidEdit import CodingNoncodingAllele
@@ -43,14 +43,18 @@ def filter_allele_by_pos(
 def filter_allele_by_base(
     allele: Allele,
     allowed_base_changes: List[Tuple] = None,
-    allowed_ref_base: str = None,
-    allowed_alt_base: str = None,
+    allowed_ref_base: Union[List, str] = None,
+    allowed_alt_base: Union[List, str] = None,
 ):
     """
     Filter alleles based on position and return the filtered allele and
     number of filtered edits.
     """
     filtered_edits = 0
+    if isinstance(allowed_ref_base, str):
+        allowed_ref_base = [allowed_ref_base]
+    if isinstance(allowed_alt_base, str):
+        allowed_alt_base = [allowed_alt_base]
     if (
         not (allowed_ref_base is None and allowed_alt_base is None)
         + (allowed_base_changes is None)
@@ -64,15 +68,15 @@ def filter_allele_by_base(
                 allele.edits.remove(edit)
     elif not allowed_ref_base is None:
         for edit in allele.edits.copy():
-            if edit.ref_base != allowed_ref_base:
+            if edit.ref_base not in allowed_ref_base:
                 filtered_edits += 1
                 allele.edits.remove(edit)
-            elif not allowed_alt_base is None and edit.alt_base != allowed_alt_base:
+            elif not allowed_alt_base is None and edit.alt_base not in allowed_alt_base:
                 filtered_edits += 1
                 allele.edits.remove(edit)
     else:
         for edit in allele.edits.copy():
-            if edit.alt_base != allowed_alt_base:
+            if edit.alt_base not in allowed_alt_base:
                 filtered_edits += 1
                 allele.edits.remove(edit)
     return (allele, filtered_edits)
@@ -145,7 +149,6 @@ def _map_alleles_to_filtered(
         raw_allele_counts.groupby("guide"),
         desc="Mapping alleles to closest filtered alleles",
     ):
-
         guide_filtered_allele_counts = filtered_allele_counts.loc[
             filtered_allele_counts.guide == guide, :
         ].set_index(allele_col)

diff --git a/bean/annotate/filter_alleles.py b/bean/annotate/filter_alleles.py
diff --git a/bean/annotate/ldlr_exons.fa b/bean/annotate/ldlr_exons.fa
diff --git a/bean/annotate/translate_allele.py b/bean/annotate/translate_allele.py
@@ -1,5 +1,6 @@
+from doctest import set_unittest_reportflags
 import os
-from typing import List, Iterable, Dict, Tuple, Collection
+from typing import List, Iterable, Dict, Tuple, Collection, Sequence
 from copy import deepcopy
 import numpy as np
 import pandas as pd
@@ -169,24 +170,24 @@ def get_cds_seq_pos_from_fasta(fasta_file_name: str) -> Tuple[List[str], List[in
     return (exon_chrom, translated_seq, genomic_pos, strand)
 
 
-def _translate_single_codon(nt_seq_string: str, aa_pos: int) -> str:
+def _translate_single_codon(
+    codon: List[str],
+) -> str:  # nt_seq_string: str, aa_pos: int) -> str:
     """Translate `aa_pos`-th codon of `nt_seq_string`."""
-    codon = "".join(nt_seq_string[aa_pos * 3 : (aa_pos * 3 + 3)])
     if len(codon) != 3:
-        print("reached the end of CDS, frameshift.")
-        return "/"
+        raise ValueError("reached the end of CDS, frameshift.")
+        return ">"
     try:
+        codon = "".join(codon)
         return codon_map[codon]
     except KeyError:
         if codon[-1] == "N" and codon[0] in BASE_SET and codon[1] in BASE_SET:
             aa_set = {codon_map[codon[:2] + N] for N in BASE_SET}
             if len(aa_set) == 1:
                 return next(iter(aa_set))
-            print(f"warning: no matching aa with codon {codon}")
+            raise ValueError(f"warning: no matching aa with codon {codon}")
         else:
-            print(f"Cannot translate codon due to ambiguity: {codon}")
-
-        return "_"
+            raise ValueError(f"Cannot translate codon due to ambiguity: {codon}")
 
 
 class CDS:
@@ -196,14 +197,22 @@ class CDS:
     def __init__(self):
         self.edited_aa_pos = set()
         self.edits_noncoding = set()
+        self.edited_nt: List[str] = []
+        self.nt: List[str] = []
+        self.strand: int = 1
+        self.gene_name: str = ""
+        self.chrom: str = ""
+        self.translated_seq: List[str] = []
+        self.pos: np.ndarray = None
+        self.genomic_pos: Sequence = []
 
     @classmethod
     def from_fasta(cls, fasta_file_name, gene_name, suppressMessage=True):
         cds = cls()
-        if fasta_file_name is None:
-            if not suppressMessage:
-                print("No fasta file provided as reference: using LDLR")
-            fasta_file_name = os.path.dirname(be.__file__) + "/annotate/ldlr_exons.fa"
+        # if fasta_file_name is not None:
+        #     if not suppressMessage:
+        #         raise ValueError("No fasta file provided as reference: using LDLR")
+        #     fasta_file_name = os.path.dirname(be.__file__) + "/annotate/ldlr_exons.fa"
         if gene_name not in cls.gene_info_dict:
             chrom, translated_seq, genomic_pos, strand = get_cds_seq_pos_from_fasta(
                 fasta_file_name
@@ -242,28 +251,36 @@ def from_gene_name(cls, gene_name, ref_version: str = "GRCh38"):
         cds.chrom = cls.gene_info_dict[gene_name]["chrom"]
         cds.translated_seq = deepcopy(cls.gene_info_dict[gene_name]["translated_seq"])
         cds.genomic_pos = cls.gene_info_dict[gene_name]["genomic_pos"]
-        cds.nt = cds.gene_info_dict[gene_name]["translated_seq"]
+        cds.nt = cds.gene_info_dict[gene_name]["translated_seq"]  # in sense strand
+        # print(cds.gene_name + ":" + "".join(cds.nt))
+        # if cds.strand == -1:
+        #     cds.nt = revcomp(cds.nt)
         cds.pos = np.array(cds.genomic_pos)
         cds.edited_nt = deepcopy(cds.nt)
         return cds
 
-    def translate(self):
-        if self.strand == -1:
-            self.edited_nt = revcomp(self.edited_nt)
-        self.aa = _translate(self.edited_nt, codon_map)
+    # def translate(self):
+    #     if self.strand == -1:
+    #         self.edited_nt = revcomp(self.edited_nt)
+    #     self.aa = _translate(self.edited_nt, codon_map)
 
     def _get_relative_nt_pos(self, absolute_pos):
+        """0-based relative position"""
         nt_relative_pos = np.where(self.pos == absolute_pos)[0]
         assert len(nt_relative_pos) <= 1, nt_relative_pos
         return nt_relative_pos.astype(int).item() if nt_relative_pos else -1
 
     def _edit_pos_to_aa_pos(self, edit_pos):
+        """0-based nt position. Adds in sense direction, needs to be reversed for antisense gene"""
         nt_relative_pos = self._get_relative_nt_pos(edit_pos)
         if nt_relative_pos != -1:
             self.edited_aa_pos.add(nt_relative_pos // 3)
+
         return nt_relative_pos
 
     def edit_single(self, edit_str):
+        """Add a mutation induced by a single `edit_str` to CDS.
+        For the negative CDS, nt and edited_nt are antisense."""
         edit = Edit.from_str(edit_str)
         rel_pos = self._edit_pos_to_aa_pos(edit.pos)
         if edit.strand == "-":
@@ -285,13 +302,16 @@ def edit_single(self, edit_str):
                     )
                 )
                 raise RefBaseMismatchException(
-                    f"{self.gene_name + ';' if hasattr(self, 'gene_name') else ''}ref:{self.nt[rel_pos]} at pos {rel_pos}, got edit {edit}"
+                    f"{self.gene_name + ';' if hasattr(self, 'gene_name') else ''}ref:{self.nt[rel_pos]} at pos {rel_pos}, got edit {edit}."
                 )
         else:
             self.edited_nt[rel_pos] = alt_base
         if alt_base == "-":  # frameshift
             # self.edited_nt.pop(rel_pos)
-            self.edited_aa_pos.update(list(range(rel_pos, len(self.nt) // 3)))
+            if self.strand == 1:
+                self.edited_aa_pos.update(list(range(rel_pos, len(self.nt) // 3)))
+            else:
+                self.edited_aa_pos.update(list(range(0, rel_pos)))
 
     def edit_allele(self, allele_str):
         if isinstance(allele_str, Allele):
@@ -301,7 +321,13 @@ def edit_allele(self, allele_str):
         for edit_str in edit_strs:
             self.edit_single(edit_str)
         if "-" in self.edited_nt:
-            self.edited_nt.remove("-")
+            self.edited_nt = [nt for nt in self.edited_nt if nt != "-"]
+        if self.strand == -1:
+            # Reverse logged edited positions as it was in the sense direction.
+            # rev_pos = self.edited_aa_pos[::-1]
+            self.edited_aa_pos = {len(self.nt) // 3 - 1 - r for r in self.edited_aa_pos}
+            self.nt = revcomp(self.nt)
+            self.edited_nt = revcomp(self.edited_nt)
 
     def get_aa_change(
         self, allele_str, include_synonymous=True
@@ -311,10 +337,31 @@ def get_aa_change(
         mutations = CodingNoncodingAllele()
         mutations.nt_allele.update(self.edits_noncoding)
         for edited_aa_pos in self.edited_aa_pos:
-            ref_aa = _translate_single_codon(self.nt, edited_aa_pos)
-            mt_aa = _translate_single_codon(self.edited_nt, edited_aa_pos)
-            if mt_aa == "_":
-                return "translation error"
+            try:
+                # print("".join(self.nt))
+                # print((3 * edited_aa_pos), (3 * edited_aa_pos + 3))
+                ref_aa = _translate_single_codon(
+                    self.nt[(3 * edited_aa_pos) : (3 * edited_aa_pos + 3)]
+                )
+            except ValueError as e:
+                print(
+                    f"Translation mismatch in translating ref for {allele_str}: {e}. Check the input .fasta or genome version used for the reporter. Ignoring this allele."
+                )
+                continue
+            try:
+                # print("".join(self.nt))
+                # print(self.edited_nt[(3 * edited_aa_pos) : (3 * edited_aa_pos + 3)])
+                mt_aa = _translate_single_codon(
+                    self.edited_nt[(3 * edited_aa_pos) : (3 * edited_aa_pos + 3)]
+                )
+            except ValueError as e:
+                print(
+                    f"Translation mismatch in translating mutated gene for {allele_str}: {e}. Check the input .fasta or genome version used for the reporter. Ignoring this allele."
+                )
+                continue
+            except IndexError as e:
+                print(f"End of gene reached by frameshift {allele_str}: {e}")
+                mt_aa = ">"
             if not include_synonymous and ref_aa == mt_aa:
                 continue
             mutations.aa_allele.add(

diff --git a/bean/annotate/utils.py b/bean/annotate/utils.py
@@ -20,7 +20,7 @@
 debug = logging.debug
 info = logging.info
 
-complement_base = {"A": "T", "T": "A", "C": "G", "G": "C"}
+complement_base = {"A": "T", "T": "A", "C": "G", "G": "C", "-": "-"}
 
 
 def revcomp(nt_list: List[str]):
@@ -97,7 +97,9 @@ def get_mane_transcript_id(gene_name: str):
     return mane_transcript_id, id_version
 
 
-def get_exons_from_transcript_id(transcript_id: str, id_version: int, ref_version: str = "GRCh38"):
+def get_exons_from_transcript_id(
+    transcript_id: str, id_version: int, ref_version: str = "GRCh38"
+):
     """
     Retrieves the exons and the start position of the coding sequence (CDS) for a given transcript ID and version.
 
@@ -114,7 +116,9 @@ def get_exons_from_transcript_id(transcript_id: str, id_version: int, ref_versio
     if transcript_json["count"] != 1:
         if transcript_json["count"] > 1:
             api_url = f"http://tark.ensembl.org/api/transcript/?stable_id={transcript_id}&stable_id_version={id_version}&assembly_name={ref_version}&expand=exons"
-            response = requests.get(api_url, headers={"Content-Type": "application/json"})
+            response = requests.get(
+                api_url, headers={"Content-Type": "application/json"}
+            )
             transcript_json = response.json()
             if transcript_json["count"] != 1:
                 raise ValueError(
@@ -214,22 +218,12 @@ def get_cds_seq_pos_from_gene_name(gene_name: str, ref_version: str = "GRCh38"):
     return cds_chrom, cds_seq, cds_pos, strand
 
 
-def parse_args():
-    """Get the input arguments"""
-    print(
-        r"""
-    _ _         
-  /  \ '\       __ _ _ _           
-  |   \  \     / _(_) | |_ ___ _ _ 
-   \   \  |   |  _| | |  _/ -_) '_|
-    `.__|/    |_| |_|_|\__\___|_|  
-    """
-    )
-    print("bean-filter: filter alleles")
-    parser = argparse.ArgumentParser(
-        prog="allele_filter",
-        description="Filter alleles based on edit position in spacer and frequency across samples.",
-    )
+def parse_args(parser=None):
+    if parser is None:
+        parser = argparse.ArgumentParser(
+            prog="allele_filter",
+            description="Filter alleles based on edit position in spacer and frequency across samples.",
+        )
     parser.add_argument(
         "bdata_path",
         type=str,
@@ -276,6 +270,12 @@ def parse_args():
         help="Only consider edit within window provided by (edit-start-pos, edit-end-pos). If this flag is not provided, `--edit-start-pos` and `--edit-end-pos` flags are ignored.",
         action="store_true",
     )
+    parser.add_argument(
+        "--keep-indels",
+        "-i",
+        help="Include indels.",
+        action="store_true",
+    )
     parser.add_argument(
         "--filter-target-basechange",
         "-b",
@@ -339,7 +339,7 @@ def parse_args():
         action="store_true",
         help="Load temporary file and work from there.",
     )
-    return parser.parse_args()
+    return parser
 
 
 def check_args(args):

diff --git a/bean/cli/__init__.py b/bean/cli/__init__.py