Skip to content

Commit

Permalink
refining exon selections with assembly version
Browse files Browse the repository at this point in the history
  • Loading branch information
jykr committed Dec 4, 2023
1 parent d7f16d4 commit a7bf84b
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 14 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ bean-profile my_sorting_screen.h5ad -o output_prefix `# Prefix for editing profi
### Output
Above command produces `prefix_editing_preference.[html,ipynb]` as editing preferences ([see example](notebooks/profile_editing_preference.ipynb)).

<img src="imgs/profile_output.png" alt="Allele translation" width="700" style="background-color:white;"/>
<img src="imgs/profile_output.png" alt="Allele translation" width="700" />

### Parameters
* `-o`, `--output-prefix` (default: `None`): Output prefix of editing pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used.
Expand All @@ -188,7 +188,7 @@ bean-qc \

`bean-qc` supports following quality control and masks samples with low quality. Specifically:

<img src="imgs/qc_output.png" alt="Allele translation" width="900" style="background-color:white;"/>
<img src="imgs/qc_output.png" alt="Allele translation" width="900"/>

* Plots guide coverage and the uniformity of coverage
* Guide count correlation between samples
Expand Down Expand Up @@ -402,5 +402,5 @@ Python package `bean` supports multiple data wrangling functionalities for `Repo
* Full pipeline takes 90.1s in GitHub Action for toy dataset of 2 replicates and 30 guides.
## Citation
If you have used BEAN, please cite:
If you have used BEAN for your analysis, please cite:
Ryu, J. et al. Joint genotypic and phenotypic outcome modeling improves base editing variant effect quantification. medRxiv (2023) doi:10.1101/2023.09.08.23295253
11 changes: 6 additions & 5 deletions bean/annotate/translate_allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,11 +225,11 @@ def from_fasta(cls, fasta_file_name, gene_name, suppressMessage=True):
return cds

@classmethod
def from_gene_name(cls, gene_name):
def from_gene_name(cls, gene_name, ref_version: str = "GRCh38"):
cds = cls()
if gene_name not in cls.gene_info_dict:
chrom, translated_seq, genomic_pos, strand = get_cds_seq_pos_from_gene_name(
gene_name
gene_name, ref_version
)
cls.gene_info_dict[gene_name] = {
"chrom": chrom,
Expand Down Expand Up @@ -329,8 +329,8 @@ def get_mismatch_df():
).drop_duplicates()


def get_cds_dict(gene_names):
return {gname: CDS.from_gene_name(gname) for gname in gene_names}
def get_cds_dict(gene_names, ref_version: str = "GRCh38"):
return {gname: CDS.from_gene_name(gname, ref_version) for gname in gene_names}


def export_gene_info_to_json(gene_dict, write_path=".tmp_gene_info.csv"):
Expand Down Expand Up @@ -418,14 +418,15 @@ def get_allele_aa_change_single_gene(
fasta_file: str = None,
fasta_file_id: str = None,
include_synonymous: bool = True,
ref_version: str = "GRCh38",
):
"""
Obtain amino acid changes
"""
if gene_name is None and fasta_file is not None:
cds = CDS.from_fasta(fasta_file, fasta_file_id)
elif gene_name is not None and fasta_file is None:
cds = CDS.from_gene_name(gene_name)
cds = CDS.from_gene_name(gene_name, ref_version)
else:
raise ValueError("Only one of `gene_name` or `fasta_file` should be provided.")
return cds.get_aa_change(allele, include_synonymous)
Expand Down
21 changes: 15 additions & 6 deletions bean/annotate/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def get_mane_transcript_id(gene_name: str):
return mane_transcript_id, id_version


def get_exons_from_transcript_id(transcript_id: str, id_version: int):
def get_exons_from_transcript_id(transcript_id: str, id_version: int, ref_version: str = "GRCh38"):
"""
Retrieves the exons and the start position of the coding sequence (CDS) for a given transcript ID and version.
Expand All @@ -112,9 +112,18 @@ def get_exons_from_transcript_id(transcript_id: str, id_version: int):
response = requests.get(api_url, headers={"Content-Type": "application/json"})
transcript_json = response.json()
if transcript_json["count"] != 1:
raise ValueError(
f"Non-unique entry for transcript ID and version:\n{transcript_json}"
)
if transcript_json["count"] > 1:
api_url = f"http://tark.ensembl.org/api/transcript/?stable_id={transcript_id}&stable_id_version={id_version}&assembly_name={ref_version}&expand=exons"
response = requests.get(api_url, headers={"Content-Type": "application/json"})
transcript_json = response.json()
if transcript_json["count"] != 1:
raise ValueError(
f"Non-unique entry for transcript ID , version {id_version} and assembly {ref_version}:\n{transcript_json}"
)
else:
raise ValueError(
f"No entry found for transcript ID {transcript_id}, version {id_version} and assembly {ref_version}"
)
transcript_record = transcript_json["results"][0]
exons_list = transcript_record["exons"]
strand = transcript_record["loc_strand"] # +1/-1
Expand Down Expand Up @@ -186,11 +195,11 @@ def get_exon_pos_seq(exon_id, id_version, cds_start, cds_end, ref_version="GRCh3
return chrom, list(sequence), list(range(start_pos, end_pos + 1))


def get_cds_seq_pos_from_gene_name(gene_name: str):
def get_cds_seq_pos_from_gene_name(gene_name: str, ref_version: str = "GRCh38"):
transcript_id, id_version = get_mane_transcript_id(gene_name)
print(f"MANE transcript ID {transcript_id} for {gene_name} will be used.")
exons_list, cds_start, cds_end, strand = get_exons_from_transcript_id(
transcript_id, id_version
transcript_id, id_version, ref_version
)
if strand == -1:
exons_list = exons_list[::-1]
Expand Down

0 comments on commit a7bf84b

Please sign in to comment.