Skip to content

Commit

Permalink
draft multi-gene translation with gene names
Browse files Browse the repository at this point in the history
  • Loading branch information
jykr committed Oct 3, 2023
1 parent d808555 commit cd8ed45
Show file tree
Hide file tree
Showing 8 changed files with 175 additions and 206 deletions.
15 changes: 11 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,13 +187,20 @@ Above command produces

## `bean-filter`: Filtering (and optionally translating) alleles
As `tiling` mode of `bean-run` accounts for any robustly observed alleles, `bean-filter` filters for such alleles.
```
bean-filter my_sorting_screen_masked.h5ad -o my_sorting_screen_filtered.h5ad
```bash
bean-filter my_sorting_screen_masked.h5ad \
-o my_sorting_screen_filtered.h5ad `# Output file path` \
```

If you want to obtain **amino acid level variant** for coding sequence tiling screens, provide coding sequence positions which variants occuring within the coding sequence will be translated.
```
bean-filter my_sorting_screen.h5ad -o my_sorting_screen_masked.h5ad --translate [--translate-fasta gene_exon.fa, --translate-fastas-csv gene_exon_fas.csv]
```bash
bean-filter my_sorting_screen.h5ad \
-o my_sorting_screen_masked.h5ad \
--translate `# Translate coding variants` \
[ --translate-gene-name GENE_SYMBOL OR
--translate-gene-names path_to_gene_names_file.txt OR
--translate-fasta gene_exon.fa, OR
--translate-fastas-csv gene_exon_fas.csv]
```
* When library covers single gene: Feed `--translate --translate-fasta gene_exon.fa` argument where `gene_exon.fa` is the FASTA file with exon sequence and range (following `range=` tag in the FASTA header line)
* When library covers multiple genes: Feed `--translate --translate-fastas-csv gene_exon_fas.csv` where `gene_exon_fas.csv` is the csv file with lines `gene_id,gene_exon_fasta_path` without header.
Expand Down
2 changes: 1 addition & 1 deletion bean/annotate/filter_alleles.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ def _map_alleles_to_filtered(
# prioritize allele with most mean counts
merge_priority = guide_filtered_allele_counts.mean(
axis=1, numeric_only=True
)
).values
if is_cn_allele:
guide_raw_counts["allele_mapped"] = guide_raw_counts[allele_col].map(
lambda allele: allele.map_to_closest(
Expand Down
176 changes: 140 additions & 36 deletions bean/annotate/translate_allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
AminoAcidEdit,
CodingNoncodingAllele,
)
from bean.annotate.utils import get_cds_seq_pos_from_gene_name
import logging
import sys

Expand Down Expand Up @@ -179,24 +180,39 @@ def _translate_single_codon(nt_seq_string: str, aa_pos: int) -> str:


class CDS:
def __init__(self, fasta_file_name=None, suppressMessage=True):
def __init__(self):
self.edited_aa_pos = set()
self.edits_noncoding = set()

@classmethod
def from_fasta(cls, fasta_file_name, suppressMessage=True):
cds = cls()
if fasta_file_name is None:
if not suppressMessage:
print("No fasta file provided as reference: using LDLR")
fasta_file_name = os.path.dirname(be.__file__) + "/annotate/ldlr_exons.fa"
try:
self.set_exon_fasta_name(fasta_file_name)
cds.set_exon_fasta_name(fasta_file_name)
except FileNotFoundError as e:
print(os.getcwd())
raise e
self.edited_nt = self.nt.copy()
self.edited_aa_pos = set()
self.edits_noncoding = set()
self.set_exon_fasta_name(fasta_file_name)
cds.edited_nt = cds.nt.copy()
return cds

@classmethod
def from_gene_name(cls, gene_name):
cds = cls()
cds.chrom, cds.translated_seq, cds.genomic_pos = get_cds_seq_pos_from_gene_name(
gene_name
)
cds.nt = cds.translated_seq
cds.pos = np.array(cds.genomic_pos)

def set_exon_fasta_name(self, fasta_file_name: str):
self.fasta_file_name = fasta_file_name
self.chrom, self.translated_seq, self.genomic_pos = _get_seq_pos_from_fasta(fasta_file_name)
self.chrom, self.translated_seq, self.genomic_pos = _get_seq_pos_from_fasta(
fasta_file_name
)
self.nt = self.translated_seq
self.pos = np.array(self.genomic_pos)

Expand Down Expand Up @@ -246,7 +262,9 @@ def edit_allele(self, allele_str):
if "-" in self.edited_nt:
self.edited_nt.remove("-")

def get_aa_change(self, allele_str, include_synonymous=True) -> CodingNoncodingAllele:
def get_aa_change(
self, allele_str, include_synonymous=True
) -> CodingNoncodingAllele:
"""1-based amino acid editing result"""
self.edit_allele(allele_str)
mutations = CodingNoncodingAllele()
Expand All @@ -261,9 +279,25 @@ def get_aa_change(self, allele_str, include_synonymous=True) -> CodingNoncodingA
mutations.aa_allele.add(AminoAcidEdit(edited_aa_pos + 1, ref_aa, mt_aa))
return mutations


def get_cds_dict(gene_names):
return {gname: CDS.from_gene_name(gname) for gname in gene_names}


class CDSCollection:
def __init__(self, gene_ids: List[str] = None, fasta_file_names : List[str] = None, suppressMessage=True):
if len(gene_ids) != len(fasta_file_names):
"""
Represents a collection of coding sequences (CDS) for multiple genes.
"""

def __init__(
self,
gene_ids: List[str] = None,
fasta_file_names: List[str] = None,
suppressMessage=True,
):
if fasta_file_names is None:
self.cds_dict = get_cds_dict(gene_ids)
elif len(gene_ids) != len(fasta_file_names):
raise ValueError("gene_ids and fasta_file_names have different lengths")
self.cds_dict = {}
for gid, fasta_file in zip(fasta_file_names, gene_ids):
Expand All @@ -290,52 +324,118 @@ def get_cds_ranges(self):
index=gids,
)

def find_overlap(self, chrom: str, start: int, end: int, range_df: pd.DataFrame) -> Optional[str]:
"""Find overlap between query range and range_df and return ID of overlapping region in range_df.
"""
if not chrom in range_df.chrom: return None
overlap = range_df.loc[(range_df.chrom == chrom) & ((start <= range_df.end) | (end >= range_df.start))]
if len(overlap) == 0: return None
if len(overlap) > 2: raise ValueError("We cannot handle overlapping genes.")
def find_overlap(
self, chrom: str, start: int, end: int, range_df: pd.DataFrame
) -> Optional[str]:
"""Find overlap between query range and range_df and return ID of overlapping region in range_df."""
if not chrom in range_df.chrom:
return None
overlap = range_df.loc[
(range_df.chrom == chrom)
& ((start <= range_df.end) | (end >= range_df.start))
]
if len(overlap) == 0:
return None
if len(overlap) > 2:
raise ValueError("We cannot handle overlapping genes.")
return overlap.index.tolist()[0]

def get_aa_change(self, allele: Allele, include_synonymous: bool=True) -> CodingNoncodingAllele:
def get_aa_change(
self, allele: Allele, include_synonymous: bool = True
) -> CodingNoncodingAllele:
"""Finds overlapping CDS and call the same function for the CDS, else return CodingNonCodingAllele with no translated allele."""
chrom, start, end = allele.get_range()
overlapping_cds = find_overlap(chrom, start, end, self.cds_ranges)
if overlapping_cds:
return self.cds_dict[overlapping_cds].get_aa_change(allele, include_synonymous)
if overlapping_cds:
return self.cds_dict[overlapping_cds].get_aa_change(
allele, include_synonymous
)
else:
return CodingNonCodingAllele.from_alleles(nt_allele = allele)
return CodingNonCodingAllele.from_alleles(nt_allele=allele)


def get_allele_aa_change_single_gene(allele: Allele , fasta_file=None, include_synonymous=True):
def get_allele_aa_change_single_gene(
allele: Allele, gene_name=None, fasta_file=None, include_synonymous=True
):
"""
Obtain amino acid changes
"""
cds = CDS(fasta_file_name=fasta_file)
if gene_name is None and fasta_file is not None:
cds = CDS.from_fasta(fasta_file)
elif gene_name is not None and fasta_file is None:
cds = CDS.from_gene_name(gene_name)
else:
raise ValueError("Only one of `gene_name` or `fasta_file` should be provided.")
return cds.get_aa_change(allele, include_synonymous)

def get_allele_aa_change_multi_genes(allele: Allele, fasta_file_dict, include_synonymous=True):

def get_allele_aa_change_multi_genes(
allele: Allele, gene_names=None, fasta_file_dict=None, include_synonymous=True
):
"""
Obtain amino acid changes for multiple provided gene exon sequences
"""
cdss = CDSCollection(gene_ids = fasta_file_dict.keys(), fasta_file_list = fasta_file_dict.values())
return cdss.get_aa_change(allele_str, include_synonymous)
if gene_names is None and fasta_file_dict is not None:
cdss = CDSCollection(
gene_names=fasta_file_dict.keys(), fasta_file_list=fasta_file_dict.values()
)
elif gene_names is not None and fasta_file_dict is None:
cdss = CDSCollection(gene_names=gene_names)
else:
raise ValueError(
"Only one of `gene_names` or `fasta_file_dict` should be provided."
)
return cdss.get_aa_change(allele, include_synonymous)


def translate_allele(
allele: Allele, include_synonymouns=True, allow_ref_mismatch=True, fasta_file: str=None, fasta_file_dict: Dict[str, str] = None,
allele: Allele,
include_synonymous=True,
allow_ref_mismatch=True,
gene_name: str = None,
gene_names: List[str] = None,
fasta_file: str = None,
fasta_file_dict: Dict[str, str] = None,
):
try:
if fasta_file_dict:
if gene_name:
if (
gene_names is not None
or (fasta_file is not None)
or (fasta_file_dict is not None)
):
raise ValueError(
"Only one of `gene_name`, `gene_names`, `fasta_file`, `fasta_file_dict` can be provided as argument."
)
return get_allele_aa_change_single_gene(
allele,
gene_name=gene_name,
include_synonymous=include_synonymous,
)
elif gene_names:
if (fasta_file is not None) or (fasta_file_dict is not None):
raise ValueError(
"Only one of `gene_name`, `gene_names`, `fasta_file`, `fasta_file_dict` can be provided as argument."
)
return get_allele_aa_change_multi_genes(
allele,
fasta_file_dict = fasta_file_dict,
include_synonymous = include_synonymous,
gene_names=fasta_file_dict,
include_synonymous=include_synonymous,
)
return get_allele_aa_change_single_gene(
allele,
fasta_file=fasta_file,
include_synonymous=include_synonymouns,
elif fasta_file:
if fasta_file_dict is not None:
raise ValueError(
"Only one of `gene_name`, `gene_names`, `fasta_file`, `fasta_file_dict` can be provided as argument."
)
return get_allele_aa_change_single_gene(
allele,
fasta_file=fasta_file,
include_synonymous=include_synonymous,
)
return get_allele_aa_change_multi_genes(
allele,
fasta_file_dict=fasta_file_dict,
include_synonymous=include_synonymous,
)
except RefBaseMismatchException as e:
if not allow_ref_mismatch:
Expand All @@ -345,9 +445,13 @@ def translate_allele(


def translate_allele_df(
allele_df, include_synonymouns=True, allow_ref_mismatch=True, fasta_file=None, fasta_file_dict: Dict[str, str] = None,
allele_df,
include_synonymouns=True,
allow_ref_mismatch=True,
fasta_file=None,
fasta_file_dict: Dict[str, str] = None,
):
if fasta_file and fasta_file_dict:
if fasta_file and fasta_file_dict:
raise ValueError("Both fasta file and fasta file dict provided.")
allele_df = allele_df.copy()
translated_alleles = allele_df.allele.map(
Expand Down
4 changes: 3 additions & 1 deletion bean/framework/Edit.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,9 @@ def map_to_closest(
raise ValueError(
f"merge_priority length {len(merge_priority)} is not the same as allele_list length {len(allele_list)}"
)
nt_max_idx = nt_max_idx[np.argmax(merge_priority[nt_max_idx])]
nt_max_idx = nt_max_idx[
np.argmax(merge_priority[nt_max_idx], skipna=True)
]
else:
nt_max_idx = nt_max_idx[0]
if nt_jaccards[nt_max_idx] > jaccard_threshold:
Expand Down
15 changes: 12 additions & 3 deletions bean/mapping/GuideEditCounter.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,11 +341,15 @@ def _count_guide_edits(
self, matched_guide_idx, R1_record: SeqIO.SeqRecord, single_base_qual_cutoff=30
):
if self.guides_has_strands:
strand = self.screen.guides.strand[matched_guide_idx]
strand = self.screen.guides.strand.iloc[matched_guide_idx]
guide_strand = strand_str_to_int.get(strand, 1)
else:
guide_strand = 1
ref_guide_seq = self.screen.guides.sequence[matched_guide_idx]
ref_guide_seq = self.screen.guides.sequence.iloc[matched_guide_idx]
if "chrom" in self.screen.guides.columns.tolist():
chrom = self.screen.guides.chrom.iloc[matched_guide_idx]
else:
chrom = None
read_guide_seq, read_guide_qual = self.get_guide_seq_qual(
R1_record, len(ref_guide_seq)
)
Expand All @@ -357,6 +361,7 @@ def _count_guide_edits(
aln_mat_path=self.output_dir + "/.aln_mat.txt",
offset=0,
strand=guide_strand,
chrom=chrom,
start_pos=0,
end_pos=len(ref_guide_seq),
positionwise_quality=np.array(read_guide_qual),
Expand Down Expand Up @@ -454,7 +459,10 @@ def _count_reporter_edits(
guide_strand, offset = self._get_strand_offset_from_guide_index(
matched_guide_idx
)

if "chrom" in self.screen.guides.columns.tolist():
chrom = self.screen.guides.chrom.iloc[matched_guide_idx]
else:
chrom = None
allele, score = _get_edited_allele_crispresso(
ref_seq=ref_reporter_seq,
query_seq=read_reporter_seq,
Expand All @@ -463,6 +471,7 @@ def _count_reporter_edits(
aln_mat_path=self.output_dir + "/.aln_mat.txt",
offset=offset,
strand=guide_strand,
chrom=chrom,
positionwise_quality=np.array(read_reporter_qual),
quality_thres=single_base_qual_cutoff,
objectify_allele=self.objectify_allele,
Expand Down
4 changes: 4 additions & 0 deletions bean/mapping/_supporting_fn.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ def _get_allele_from_alignment(
strand: int,
start_pos: int,
end_pos: int,
chrom: str = None,
positionwise_quality: np.ndarray = None,
quality_thres: float = -1,
):
Expand Down Expand Up @@ -210,6 +211,7 @@ def _get_allele_from_alignment(
if alt_base_is_good_quality and ref_pos >= start_pos and ref_pos < end_pos:
allele.add(
Edit(
chrom=chrom,
rel_pos=ref_pos,
ref_base=ref_base,
alt_base=alt_base,
Expand All @@ -228,6 +230,7 @@ def _get_edited_allele_crispresso(
aln_mat_path: str,
offset: int,
strand: int = 1,
chrom: str = None,
start_pos: int = 0,
end_pos: int = 100,
positionwise_quality: np.ndarray = None,
Expand Down Expand Up @@ -255,6 +258,7 @@ def _get_edited_allele_crispresso(
strand,
start_pos,
end_pos,
chrom,
positionwise_quality,
quality_thres,
)
Expand Down
2 changes: 1 addition & 1 deletion bean/mapping/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def _get_input_parser():
)
parser.add_argument(
"-q",
"--min_average_read_quality",
"--min-average-read-quality",
type=int,
help="Minimum average quality score (phred33) to keep a read",
default=30,
Expand Down
Loading

0 comments on commit cd8ed45

Please sign in to comment.