Skip to content

Commit

Permalink
Merge pull request #21 from pinellolab/dev
Browse files Browse the repository at this point in the history
v1.0
  • Loading branch information
jykr authored Mar 31, 2024
2 parents 8dce316 + f3a6932 commit 1b46538
Show file tree
Hide file tree
Showing 188 changed files with 8,641 additions and 1,935 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/CI.yml
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: '3.8'
python-version: '3.x'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install torch==1.12.1+cpu torchvision==0.13.1+cpu torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cpu
pip install torch torchvision torchaudio
pip install -r requirements.txt
pip install -e .
- name: Test with pytest
Expand Down
41 changes: 41 additions & 0 deletions .github/workflows/documentation.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
name: "Sphinx: Render docs"

on: push

jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: '3.x'

- name: Install Sphinx & Dependencies
run: |
pip install sphinx sphinx_markdown_builder sphinx_rtd_theme sphinx-argparse m2r pandas bio
sudo apt-get install python3-distutils
- name: Build Documentation
working-directory: docs
run: sphinx-build . _build
- name: copy image files
run: cp -r docs/assets docs/_build/
- uses: actions/upload-pages-artifact@v3
with:
name: github-pages
path: docs/_build/

deploy:
needs: build
permissions:
id-token: write
pages: write
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
runs-on: ubuntu-latest
steps:
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v4
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
*.pyc
*.fastq
Empty file modified LICENSE
100644 → 100755
Empty file.
Empty file modified MANIFEST.in
100644 → 100755
Empty file.
452 changes: 37 additions & 415 deletions README.md
100644 → 100755

Large diffs are not rendered by default.

Empty file modified bean/__init__.py
100644 → 100755
Empty file.
Empty file modified bean/annotate/__init__.py
100644 → 100755
Empty file.
17 changes: 10 additions & 7 deletions bean/annotate/_supporting_fn.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from copy import deepcopy
from typing import List, Tuple
from typing import List, Tuple, Union
from tqdm.auto import tqdm
from ..framework.Edit import Edit, Allele
from ..framework.AminoAcidEdit import CodingNoncodingAllele
Expand Down Expand Up @@ -43,14 +43,18 @@ def filter_allele_by_pos(
def filter_allele_by_base(
allele: Allele,
allowed_base_changes: List[Tuple] = None,
allowed_ref_base: str = None,
allowed_alt_base: str = None,
allowed_ref_base: Union[List, str] = None,
allowed_alt_base: Union[List, str] = None,
):
"""
Filter alleles based on position and return the filtered allele and
number of filtered edits.
"""
filtered_edits = 0
if isinstance(allowed_ref_base, str):
allowed_ref_base = [allowed_ref_base]
if isinstance(allowed_alt_base, str):
allowed_alt_base = [allowed_alt_base]
if (
not (allowed_ref_base is None and allowed_alt_base is None)
+ (allowed_base_changes is None)
Expand All @@ -64,15 +68,15 @@ def filter_allele_by_base(
allele.edits.remove(edit)
elif not allowed_ref_base is None:
for edit in allele.edits.copy():
if edit.ref_base != allowed_ref_base:
if edit.ref_base not in allowed_ref_base:
filtered_edits += 1
allele.edits.remove(edit)
elif not allowed_alt_base is None and edit.alt_base != allowed_alt_base:
elif not allowed_alt_base is None and edit.alt_base not in allowed_alt_base:
filtered_edits += 1
allele.edits.remove(edit)
else:
for edit in allele.edits.copy():
if edit.alt_base != allowed_alt_base:
if edit.alt_base not in allowed_alt_base:
filtered_edits += 1
allele.edits.remove(edit)
return (allele, filtered_edits)
Expand Down Expand Up @@ -145,7 +149,6 @@ def _map_alleles_to_filtered(
raw_allele_counts.groupby("guide"),
desc="Mapping alleles to closest filtered alleles",
):

guide_filtered_allele_counts = filtered_allele_counts.loc[
filtered_allele_counts.guide == guide, :
].set_index(allele_col)
Expand Down
Empty file modified bean/annotate/filter_alleles.py
100644 → 100755
Empty file.
Empty file modified bean/annotate/ldlr_exons.fa
100644 → 100755
Empty file.
97 changes: 72 additions & 25 deletions bean/annotate/translate_allele.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from doctest import set_unittest_reportflags
import os
from typing import List, Iterable, Dict, Tuple, Collection
from typing import List, Iterable, Dict, Tuple, Collection, Sequence
from copy import deepcopy
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -169,24 +170,24 @@ def get_cds_seq_pos_from_fasta(fasta_file_name: str) -> Tuple[List[str], List[in
return (exon_chrom, translated_seq, genomic_pos, strand)


def _translate_single_codon(nt_seq_string: str, aa_pos: int) -> str:
def _translate_single_codon(
codon: List[str],
) -> str: # nt_seq_string: str, aa_pos: int) -> str:
"""Translate `aa_pos`-th codon of `nt_seq_string`."""
codon = "".join(nt_seq_string[aa_pos * 3 : (aa_pos * 3 + 3)])
if len(codon) != 3:
print("reached the end of CDS, frameshift.")
return "/"
raise ValueError("reached the end of CDS, frameshift.")
return ">"
try:
codon = "".join(codon)
return codon_map[codon]
except KeyError:
if codon[-1] == "N" and codon[0] in BASE_SET and codon[1] in BASE_SET:
aa_set = {codon_map[codon[:2] + N] for N in BASE_SET}
if len(aa_set) == 1:
return next(iter(aa_set))
print(f"warning: no matching aa with codon {codon}")
raise ValueError(f"warning: no matching aa with codon {codon}")
else:
print(f"Cannot translate codon due to ambiguity: {codon}")

return "_"
raise ValueError(f"Cannot translate codon due to ambiguity: {codon}")


class CDS:
Expand All @@ -196,14 +197,22 @@ class CDS:
def __init__(self):
self.edited_aa_pos = set()
self.edits_noncoding = set()
self.edited_nt: List[str] = []
self.nt: List[str] = []
self.strand: int = 1
self.gene_name: str = ""
self.chrom: str = ""
self.translated_seq: List[str] = []
self.pos: np.ndarray = None
self.genomic_pos: Sequence = []

@classmethod
def from_fasta(cls, fasta_file_name, gene_name, suppressMessage=True):
cds = cls()
if fasta_file_name is None:
if not suppressMessage:
print("No fasta file provided as reference: using LDLR")
fasta_file_name = os.path.dirname(be.__file__) + "/annotate/ldlr_exons.fa"
# if fasta_file_name is not None:
# if not suppressMessage:
# raise ValueError("No fasta file provided as reference: using LDLR")
# fasta_file_name = os.path.dirname(be.__file__) + "/annotate/ldlr_exons.fa"
if gene_name not in cls.gene_info_dict:
chrom, translated_seq, genomic_pos, strand = get_cds_seq_pos_from_fasta(
fasta_file_name
Expand Down Expand Up @@ -242,28 +251,36 @@ def from_gene_name(cls, gene_name, ref_version: str = "GRCh38"):
cds.chrom = cls.gene_info_dict[gene_name]["chrom"]
cds.translated_seq = deepcopy(cls.gene_info_dict[gene_name]["translated_seq"])
cds.genomic_pos = cls.gene_info_dict[gene_name]["genomic_pos"]
cds.nt = cds.gene_info_dict[gene_name]["translated_seq"]
cds.nt = cds.gene_info_dict[gene_name]["translated_seq"] # in sense strand
# print(cds.gene_name + ":" + "".join(cds.nt))
# if cds.strand == -1:
# cds.nt = revcomp(cds.nt)
cds.pos = np.array(cds.genomic_pos)
cds.edited_nt = deepcopy(cds.nt)
return cds

def translate(self):
if self.strand == -1:
self.edited_nt = revcomp(self.edited_nt)
self.aa = _translate(self.edited_nt, codon_map)
# def translate(self):
# if self.strand == -1:
# self.edited_nt = revcomp(self.edited_nt)
# self.aa = _translate(self.edited_nt, codon_map)

def _get_relative_nt_pos(self, absolute_pos):
"""0-based relative position"""
nt_relative_pos = np.where(self.pos == absolute_pos)[0]
assert len(nt_relative_pos) <= 1, nt_relative_pos
return nt_relative_pos.astype(int).item() if nt_relative_pos else -1

def _edit_pos_to_aa_pos(self, edit_pos):
"""0-based nt position. Adds in sense direction, needs to be reversed for antisense gene"""
nt_relative_pos = self._get_relative_nt_pos(edit_pos)
if nt_relative_pos != -1:
self.edited_aa_pos.add(nt_relative_pos // 3)

return nt_relative_pos

def edit_single(self, edit_str):
"""Add a mutation induced by a single `edit_str` to CDS.
For the negative CDS, nt and edited_nt are antisense."""
edit = Edit.from_str(edit_str)
rel_pos = self._edit_pos_to_aa_pos(edit.pos)
if edit.strand == "-":
Expand All @@ -285,13 +302,16 @@ def edit_single(self, edit_str):
)
)
raise RefBaseMismatchException(
f"{self.gene_name + ';' if hasattr(self, 'gene_name') else ''}ref:{self.nt[rel_pos]} at pos {rel_pos}, got edit {edit}"
f"{self.gene_name + ';' if hasattr(self, 'gene_name') else ''}ref:{self.nt[rel_pos]} at pos {rel_pos}, got edit {edit}."
)
else:
self.edited_nt[rel_pos] = alt_base
if alt_base == "-": # frameshift
# self.edited_nt.pop(rel_pos)
self.edited_aa_pos.update(list(range(rel_pos, len(self.nt) // 3)))
if self.strand == 1:
self.edited_aa_pos.update(list(range(rel_pos, len(self.nt) // 3)))
else:
self.edited_aa_pos.update(list(range(0, rel_pos)))

def edit_allele(self, allele_str):
if isinstance(allele_str, Allele):
Expand All @@ -301,7 +321,13 @@ def edit_allele(self, allele_str):
for edit_str in edit_strs:
self.edit_single(edit_str)
if "-" in self.edited_nt:
self.edited_nt.remove("-")
self.edited_nt = [nt for nt in self.edited_nt if nt != "-"]
if self.strand == -1:
# Reverse logged edited positions as it was in the sense direction.
# rev_pos = self.edited_aa_pos[::-1]
self.edited_aa_pos = {len(self.nt) // 3 - 1 - r for r in self.edited_aa_pos}
self.nt = revcomp(self.nt)
self.edited_nt = revcomp(self.edited_nt)

def get_aa_change(
self, allele_str, include_synonymous=True
Expand All @@ -311,10 +337,31 @@ def get_aa_change(
mutations = CodingNoncodingAllele()
mutations.nt_allele.update(self.edits_noncoding)
for edited_aa_pos in self.edited_aa_pos:
ref_aa = _translate_single_codon(self.nt, edited_aa_pos)
mt_aa = _translate_single_codon(self.edited_nt, edited_aa_pos)
if mt_aa == "_":
return "translation error"
try:
# print("".join(self.nt))
# print((3 * edited_aa_pos), (3 * edited_aa_pos + 3))
ref_aa = _translate_single_codon(
self.nt[(3 * edited_aa_pos) : (3 * edited_aa_pos + 3)]
)
except ValueError as e:
print(
f"Translation mismatch in translating ref for {allele_str}: {e}. Check the input .fasta or genome version used for the reporter. Ignoring this allele."
)
continue
try:
# print("".join(self.nt))
# print(self.edited_nt[(3 * edited_aa_pos) : (3 * edited_aa_pos + 3)])
mt_aa = _translate_single_codon(
self.edited_nt[(3 * edited_aa_pos) : (3 * edited_aa_pos + 3)]
)
except ValueError as e:
print(
f"Translation mismatch in translating mutated gene for {allele_str}: {e}. Check the input .fasta or genome version used for the reporter. Ignoring this allele."
)
continue
except IndexError as e:
print(f"End of gene reached by frameshift {allele_str}: {e}")
mt_aa = ">"
if not include_synonymous and ref_aa == mt_aa:
continue
mutations.aa_allele.add(
Expand Down
40 changes: 20 additions & 20 deletions bean/annotate/utils.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
debug = logging.debug
info = logging.info

complement_base = {"A": "T", "T": "A", "C": "G", "G": "C"}
complement_base = {"A": "T", "T": "A", "C": "G", "G": "C", "-": "-"}


def revcomp(nt_list: List[str]):
Expand Down Expand Up @@ -97,7 +97,9 @@ def get_mane_transcript_id(gene_name: str):
return mane_transcript_id, id_version


def get_exons_from_transcript_id(transcript_id: str, id_version: int, ref_version: str = "GRCh38"):
def get_exons_from_transcript_id(
transcript_id: str, id_version: int, ref_version: str = "GRCh38"
):
"""
Retrieves the exons and the start position of the coding sequence (CDS) for a given transcript ID and version.
Expand All @@ -114,7 +116,9 @@ def get_exons_from_transcript_id(transcript_id: str, id_version: int, ref_versio
if transcript_json["count"] != 1:
if transcript_json["count"] > 1:
api_url = f"http://tark.ensembl.org/api/transcript/?stable_id={transcript_id}&stable_id_version={id_version}&assembly_name={ref_version}&expand=exons"
response = requests.get(api_url, headers={"Content-Type": "application/json"})
response = requests.get(
api_url, headers={"Content-Type": "application/json"}
)
transcript_json = response.json()
if transcript_json["count"] != 1:
raise ValueError(
Expand Down Expand Up @@ -214,22 +218,12 @@ def get_cds_seq_pos_from_gene_name(gene_name: str, ref_version: str = "GRCh38"):
return cds_chrom, cds_seq, cds_pos, strand


def parse_args():
"""Get the input arguments"""
print(
r"""
_ _
/ \ '\ __ _ _ _
| \ \ / _(_) | |_ ___ _ _
\ \ | | _| | | _/ -_) '_|
`.__|/ |_| |_|_|\__\___|_|
"""
)
print("bean-filter: filter alleles")
parser = argparse.ArgumentParser(
prog="allele_filter",
description="Filter alleles based on edit position in spacer and frequency across samples.",
)
def parse_args(parser=None):
if parser is None:
parser = argparse.ArgumentParser(
prog="allele_filter",
description="Filter alleles based on edit position in spacer and frequency across samples.",
)
parser.add_argument(
"bdata_path",
type=str,
Expand Down Expand Up @@ -276,6 +270,12 @@ def parse_args():
help="Only consider edit within window provided by (edit-start-pos, edit-end-pos). If this flag is not provided, `--edit-start-pos` and `--edit-end-pos` flags are ignored.",
action="store_true",
)
parser.add_argument(
"--keep-indels",
"-i",
help="Include indels.",
action="store_true",
)
parser.add_argument(
"--filter-target-basechange",
"-b",
Expand Down Expand Up @@ -339,7 +339,7 @@ def parse_args():
action="store_true",
help="Load temporary file and work from there.",
)
return parser.parse_args()
return parser


def check_args(args):
Expand Down
Empty file added bean/cli/__init__.py
Empty file.
Loading

0 comments on commit 1b46538

Please sign in to comment.