refining exon selections with assembly version

pinellolab · Dec 4, 2023 · a7bf84b · a7bf84b
1 parent d7f16d4
commit a7bf84b
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -165,7 +165,7 @@ bean-profile my_sorting_screen.h5ad -o output_prefix `# Prefix for editing profi
 ### Output
 Above command produces `prefix_editing_preference.[html,ipynb]` as editing preferences ([see example](notebooks/profile_editing_preference.ipynb)).  
 
-<img src="imgs/profile_output.png" alt="Allele translation" width="700" style="background-color:white;"/>  
+<img src="imgs/profile_output.png" alt="Allele translation" width="700" />  
 
 ### Parameters
   * `-o`, `--output-prefix` (default: `None`): Output prefix of editing pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used.
@@ -188,7 +188,7 @@ bean-qc \
 
 `bean-qc` supports following quality control and masks samples with low quality. Specifically:  
 
-<img src="imgs/qc_output.png" alt="Allele translation" width="900" style="background-color:white;"/>  
+<img src="imgs/qc_output.png" alt="Allele translation" width="900"/>  
 
 * Plots guide coverage and the uniformity of coverage
 * Guide count correlation between samples
@@ -402,5 +402,5 @@ Python package `bean` supports multiple data wrangling functionalities for `Repo
 * Full pipeline takes 90.1s in GitHub Action for toy dataset of 2 replicates and 30 guides.
 
 ## Citation
-If you have used BEAN, please cite:  
+If you have used BEAN for your analysis, please cite:  
 Ryu, J. et al. Joint genotypic and phenotypic outcome modeling improves base editing variant effect quantification. medRxiv (2023) doi:10.1101/2023.09.08.23295253
diff --git a/bean/annotate/translate_allele.py b/bean/annotate/translate_allele.py
@@ -225,11 +225,11 @@ def from_fasta(cls, fasta_file_name, gene_name, suppressMessage=True):
         return cds
 
     @classmethod
-    def from_gene_name(cls, gene_name):
+    def from_gene_name(cls, gene_name, ref_version: str = "GRCh38"):
         cds = cls()
         if gene_name not in cls.gene_info_dict:
             chrom, translated_seq, genomic_pos, strand = get_cds_seq_pos_from_gene_name(
-                gene_name
+                gene_name, ref_version
             )
             cls.gene_info_dict[gene_name] = {
                 "chrom": chrom,
@@ -329,8 +329,8 @@ def get_mismatch_df():
     ).drop_duplicates()
 
 
-def get_cds_dict(gene_names):
-    return {gname: CDS.from_gene_name(gname) for gname in gene_names}
+def get_cds_dict(gene_names, ref_version: str = "GRCh38"):
+    return {gname: CDS.from_gene_name(gname, ref_version) for gname in gene_names}
 
 
 def export_gene_info_to_json(gene_dict, write_path=".tmp_gene_info.csv"):
@@ -418,14 +418,15 @@ def get_allele_aa_change_single_gene(
     fasta_file: str = None,
     fasta_file_id: str = None,
     include_synonymous: bool = True,
+    ref_version: str = "GRCh38",
 ):
     """
     Obtain amino acid changes
     """
     if gene_name is None and fasta_file is not None:
         cds = CDS.from_fasta(fasta_file, fasta_file_id)
     elif gene_name is not None and fasta_file is None:
-        cds = CDS.from_gene_name(gene_name)
+        cds = CDS.from_gene_name(gene_name, ref_version)
     else:
         raise ValueError("Only one of `gene_name` or `fasta_file` should be provided.")
     return cds.get_aa_change(allele, include_synonymous)

diff --git a/bean/annotate/utils.py b/bean/annotate/utils.py
@@ -97,7 +97,7 @@ def get_mane_transcript_id(gene_name: str):
     return mane_transcript_id, id_version
 
 
-def get_exons_from_transcript_id(transcript_id: str, id_version: int):
+def get_exons_from_transcript_id(transcript_id: str, id_version: int, ref_version: str = "GRCh38"):
     """
     Retrieves the exons and the start position of the coding sequence (CDS) for a given transcript ID and version.
 
@@ -112,9 +112,18 @@ def get_exons_from_transcript_id(transcript_id: str, id_version: int):
     response = requests.get(api_url, headers={"Content-Type": "application/json"})
     transcript_json = response.json()
     if transcript_json["count"] != 1:
-        raise ValueError(
-            f"Non-unique entry for transcript ID and version:\n{transcript_json}"
-        )
+        if transcript_json["count"] > 1:
+            api_url = f"http://tark.ensembl.org/api/transcript/?stable_id={transcript_id}&stable_id_version={id_version}&assembly_name={ref_version}&expand=exons"
+            response = requests.get(api_url, headers={"Content-Type": "application/json"})
+            transcript_json = response.json()
+            if transcript_json["count"] != 1:
+                raise ValueError(
+                    f"Non-unique entry for transcript ID , version {id_version} and assembly {ref_version}:\n{transcript_json}"
+                )
+        else:
+            raise ValueError(
+                f"No entry found for transcript ID {transcript_id}, version {id_version} and assembly {ref_version}"
+            )
     transcript_record = transcript_json["results"][0]
     exons_list = transcript_record["exons"]
     strand = transcript_record["loc_strand"]  # +1/-1
@@ -186,11 +195,11 @@ def get_exon_pos_seq(exon_id, id_version, cds_start, cds_end, ref_version="GRCh3
     return chrom, list(sequence), list(range(start_pos, end_pos + 1))
 
 
-def get_cds_seq_pos_from_gene_name(gene_name: str):
+def get_cds_seq_pos_from_gene_name(gene_name: str, ref_version: str = "GRCh38"):
     transcript_id, id_version = get_mane_transcript_id(gene_name)
     print(f"MANE transcript ID {transcript_id} for {gene_name} will be used.")
     exons_list, cds_start, cds_end, strand = get_exons_from_transcript_id(
-        transcript_id, id_version
+        transcript_id, id_version, ref_version
     )
     if strand == -1:
         exons_list = exons_list[::-1]