feat: repair GTFs that lack gene_type / gene_biotype in exons

CCBR · Dec 3, 2024 · 44057f4 · 44057f4
1 parent 2fcb67f
commit 44057f4
Show file tree

Hide file tree

Showing 5 changed files with 147 additions and 22 deletions.
diff --git a/config/cluster.json b/config/cluster.json
@@ -51,7 +51,8 @@
         "time": "1-00:00:00"
     },
     "qualimapinfo": {
-        "mem": "20g"
+        "mem": "20g",
+        "time": "1:00:00"
     },
     "rsem": {
         "gres": "lscratch:500",
@@ -120,5 +121,9 @@
     },
     "trim_se": {
         "threads": "32"
+    },
+    "reformat_gtf": {
+        "mem": "20g",
+        "time": "1:00:00"
     }
 }
diff --git a/config/containers/images.json b/config/containers/images.json
@@ -5,6 +5,7 @@
         "base": "docker://nciccbr/ccbr_ubuntu_22.04:v4",
         "bbtools": "docker://nciccbr/ccbr_bbtools_38.87:v0.0.1",
         "build_rnaseq": "docker://nciccbr/ccbr_build_rnaseq:v0.0.1",
+        "build_gtf": "docker://nciccbr/ccbr_build_rnaseq:v2",
         "cutadapt": "docker://nciccbr/ccbr_cutadapt_1.18:v032219",
         "fastq_screen": "docker://nciccbr/ccbr_fastq_screen_0.13.0:v2.0",
         "fastqc": "docker://nciccbr/ccbr_fastqc_0.11.9:v1.1",

diff --git a/workflow/rules/build.smk b/workflow/rules/build.smk
@@ -64,13 +64,13 @@ def str_bool(s):
 
 
 # Global Workflow variables
-configfile:join("config","build.yml")
+configfile: join("config","build.yml")
 GENOME=config["GENOME"].strip().replace(' ', '')
 # READLENGTHS=config["READLENGTHS"]
 REFFA=config["REFFA"]
-GTFFILE=config["GTFFILE"]
 GTFVER=config["GTFVER"].strip().replace(' ', '')
 OUTDIR=config["OUTDIR"]
+GTFFILE = join(OUTDIR, basename(config["GTFFILE"]) + '.reformat.gtf')
 SCRIPTSDIR=config["SCRIPTSDIR"]
 tmpdir=config["TMP_DIR"]
 workdir:OUTDIR
@@ -84,15 +84,8 @@ with open(join(OUTDIR, 'config', 'cluster.json')) as fh:
     cluster = json.load(fh)
 
 # Ensures backwards compatibility
-try:
-    SMALL_GENOME=config["SMALL_GENOME"]
-except KeyError:
-    SMALL_GENOME="False"
-
-try:
-    SHARED_PATH=config["SHARED_RESOURCES"]
-except KeyError:
-    SHARED_PATH="None"
+SMALL_GENOME=config["SMALL_GENOME"] if "SMALL_GENOME" in config else "False"
+SHARED_PATH=config["SHARED_RESOURCES"] if "SHARED_RESOURCES" in config else "None"
 
 
 rule all:
@@ -136,6 +129,21 @@ rule all:
         provided(expand(join(SHARED_PATH, "20180907_standard_kraken2", "{ref}.k2d"), ref=["hash", "opts", "taxo"]), SHARED_PATH),
 
 
+rule reformat_gtf:
+    """
+    Repairs GTF file to ensure it is in a format that is compatible with qualimap.
+    All exons must have a gene_type or gene_biotype column.
+    """
+    input:
+        gtf = config["GTFFILE"]
+    output:
+        gtf = GTFFILE
+    container: config['images']['build_gtf']
+    script:
+        join(SCRIPTSDIR, "repair_gtf.py")
+
+
+
 rule rsem:
     """
     Builds reference files for rsem to quantify gene and isoform counts.
@@ -512,16 +520,18 @@ rule qualimapinfo:
         gtf=GTFFILE,
     output:
         "qualimap_info.txt",
+    log:
+        stderr="qualimap_error.log",
     params:
         rname='bl:qualimapinfo',
         generate_qualimap=join(SCRIPTSDIR, "generate_qualimap_ref.py"),
-    container: config['images']['build_rnaseq']
+    container: config['images']['build_gtf']
     shell: """
     python3 {params.generate_qualimap} \\
         -g {input.gtf} \\
         -f {input.fa} \\
         -o {output} \\
-        --ignore-strange-chrom 2> qualimap_error.log
+        --ignore-strange-chrom 2> {log.stderr}
     """
 
 

diff --git a/workflow/scripts/builder/generate_qualimap_ref.py b/workflow/scripts/builder/generate_qualimap_ref.py
@@ -53,24 +53,31 @@ def write_qualimap_info(args):
     open_func = get_open(gtfFileName)
     with open_func(gtfFileName) as gtf_file:
         gtf_file = HTSeq.GFF_Reader(gtf_file)
-        features = collections.defaultdict(list)
+        exons_dict = collections.defaultdict(list)
+        other_features = []
         for feature in gtf_file:
             if feature.type == "exon":
                 geneName = feature.attr["gene_id"]
-                features[geneName].append(feature)
-
+                exons_dict[geneName].append(feature)
     outFile = open(outFileName, "w")
     outFile.write(f'"biotypes"\t"length"\t"gc"\n')
 
-    for geneId, exons in features.items():
+    for geneId, exons in exons_dict.items():
         print("Processing %s" % geneId)
 
         if len(exons) == 0:
             continue
-        try:
-            biotype = exons[0].attr["gene_type"]
-        except KeyError:
-            biotype = exons[0].attr["gene_biotype"]
+
+        biotypes = ["gene_type", "gene_biotype"]
+        biotype = None
+        for biotype_str in biotypes:
+            if biotype_str in exons[0].attr:
+                biotype = exons[0].attr[biotype_str]
+                break
+        if not biotype:
+            raise ValueError(
+                f"No biotype found for exon {exons[0].attr['transcript_id']}. Exons must have at least one of these attributes: {', '.join(biotypes)}"
+            )
 
         # build transcripts dictionary
         transcripts = {}

diff --git a/workflow/scripts/builder/repair_gtf.py b/workflow/scripts/builder/repair_gtf.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+import sys
+import collections
+import gzip
+import HTSeq
+import functools
+import sys
+
+
+def get_open_func(filename):
+    return functools.partial(gzip.open, mode="rt") if filename.endswith(".gz") else open
+
+
+def retrieve_header(in_filename):
+    open_func = get_open_func(in_filename)
+    with open_func(in_filename) as file_handle:
+        header = list()
+        for line in file_handle:
+            if line.startswith("#"):
+                header.append(line)
+            else:
+                break
+    return header
+
+
+def build_genes_dict(in_filename):
+    """
+    Create a dictionary mapping gene IDs to feature objects from HTSeq.GFF_Reader
+    """
+    open_func = get_open_func(in_filename)
+    with open_func(in_filename) as gtf_file:
+        gtf_file = HTSeq.GFF_Reader(gtf_file)
+        genes_dict = {}
+        for feature in gtf_file:
+            if feature.type == "gene":
+                gene_id = feature.attr["gene_id"]
+                if gene_id in genes_dict:
+                    raise KeyError(f"Duplicate gene ID found: {gene_id}")
+                genes_dict[gene_id] = feature
+    return genes_dict
+
+
+def fix_exon_attrs(exon_feature, genes_dict, debug=True):
+    """
+    If exon_feature does not have 'gene_type' or 'gene_biotype', retrieve them from the gene feature
+    """
+    biotypes = ["gene_type", "gene_biotype"]
+    if not any(key in exon_feature.attr for key in biotypes):
+        debug_msg = f"Exon {exon_feature.attr['gene_id']} missing 'gene_type' or 'gene_biotype'. Adding "
+        gene = genes_dict[exon_feature.attr["gene_id"]]
+        if "gene_type" in gene.attr:
+            exon_feature.attr["gene_type"] = gene.attr["gene_type"]
+            debug_msg += f"'gene_type'={gene.attr['gene_type']} "
+        if "gene_biotype" in gene.attr:
+            exon_feature.attr["gene_biotype"] = gene.attr["gene_biotype"]
+            debug_msg += f"'gene_biotype'={gene.attr['gene_biotype']} "
+        if not any(key in gene.attr for key in biotypes):
+            raise KeyError(
+                f"Gene {exon_feature.attr['gene_id']} does not have 'gene_type' or 'gene_biotype'"
+            )
+        if debug:
+            print(debug_msg)
+    return exon_feature
+
+
+def repair_gtf(in_filename, out_filename, debug=False):
+    # build a dictionary of gene features
+    genes_dict = build_genes_dict(in_filename)
+
+    # iterate over features and add gene_type/biotype if needed
+    new_gtf_lines = retrieve_header(in_filename)
+    open_func = get_open_func(in_filename)
+    with open_func(in_filename) as gtf_file:
+        gtf_file = HTSeq.GFF_Reader(gtf_file)
+        for feature in gtf_file:
+            if feature.type == "exon":
+                feature = fix_exon_attrs(feature, genes_dict, debug=debug)
+            new_gtf_lines.append(feature.get_gff_line())
+
+    # write the new GTF file
+    with open(out_filename, "w") as out_file:
+        out_file.writelines(new_gtf_lines)
+
+
+if __name__ == "__main__":
+    if "snakemake" in locals() or "snakemake" in globals():
+        in_filename = snakemake.input.gtf
+        out_filename = snakemake.output.gtf
+    else:
+        in_filename = sys.argv[1]
+        out_filename = sys.argv[2]
+    repair_gtf(in_filename, out_filename)
+
+test_cmd = """
+/data/CCBR_Pipeliner/Pipelines/RENEE/renee-dev-sovacool/bin/renee build \
+    --sif-cache /data/CCBR_Pipeliner/SIFs \
+    --ref-fa /data/CCBR_Pipeliner/db/PipeDB/GDC_refs/downloads/GRCh38.d1.vd1.fa \
+    --ref-name hg38 \
+    --ref-gtf /data/CCBR_Pipeliner/db/PipeDB/Indices/GTFs/hg38/gencode.v45.primary_assembly.annotation.gtf \
+    --gtf-ver v45-test \
+    --output /data/CCBR_Pipeliner/db/PipeDB/Indices/test
+"""