Skip to content

Commit

Permalink
feat: repair GTFs that lack gene_type / gene_biotype in exons
Browse files Browse the repository at this point in the history
  • Loading branch information
kelly-sovacool committed Dec 3, 2024
1 parent 2fcb67f commit 44057f4
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 22 deletions.
7 changes: 6 additions & 1 deletion config/cluster.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@
"time": "1-00:00:00"
},
"qualimapinfo": {
"mem": "20g"
"mem": "20g",
"time": "1:00:00"
},
"rsem": {
"gres": "lscratch:500",
Expand Down Expand Up @@ -120,5 +121,9 @@
},
"trim_se": {
"threads": "32"
},
"reformat_gtf": {
"mem": "20g",
"time": "1:00:00"
}
}
1 change: 1 addition & 0 deletions config/containers/images.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"base": "docker://nciccbr/ccbr_ubuntu_22.04:v4",
"bbtools": "docker://nciccbr/ccbr_bbtools_38.87:v0.0.1",
"build_rnaseq": "docker://nciccbr/ccbr_build_rnaseq:v0.0.1",
"build_gtf": "docker://nciccbr/ccbr_build_rnaseq:v2",
"cutadapt": "docker://nciccbr/ccbr_cutadapt_1.18:v032219",
"fastq_screen": "docker://nciccbr/ccbr_fastq_screen_0.13.0:v2.0",
"fastqc": "docker://nciccbr/ccbr_fastqc_0.11.9:v1.1",
Expand Down
36 changes: 23 additions & 13 deletions workflow/rules/build.smk
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,13 @@ def str_bool(s):


# Global Workflow variables
configfile:join("config","build.yml")
configfile: join("config","build.yml")
GENOME=config["GENOME"].strip().replace(' ', '')
# READLENGTHS=config["READLENGTHS"]
REFFA=config["REFFA"]
GTFFILE=config["GTFFILE"]
GTFVER=config["GTFVER"].strip().replace(' ', '')
OUTDIR=config["OUTDIR"]
GTFFILE = join(OUTDIR, basename(config["GTFFILE"]) + '.reformat.gtf')
SCRIPTSDIR=config["SCRIPTSDIR"]
tmpdir=config["TMP_DIR"]
workdir:OUTDIR
Expand All @@ -84,15 +84,8 @@ with open(join(OUTDIR, 'config', 'cluster.json')) as fh:
cluster = json.load(fh)

# Ensures backwards compatibility
try:
SMALL_GENOME=config["SMALL_GENOME"]
except KeyError:
SMALL_GENOME="False"

try:
SHARED_PATH=config["SHARED_RESOURCES"]
except KeyError:
SHARED_PATH="None"
SMALL_GENOME=config["SMALL_GENOME"] if "SMALL_GENOME" in config else "False"
SHARED_PATH=config["SHARED_RESOURCES"] if "SHARED_RESOURCES" in config else "None"


rule all:
Expand Down Expand Up @@ -136,6 +129,21 @@ rule all:
provided(expand(join(SHARED_PATH, "20180907_standard_kraken2", "{ref}.k2d"), ref=["hash", "opts", "taxo"]), SHARED_PATH),


rule reformat_gtf:
"""
Repairs GTF file to ensure it is in a format that is compatible with qualimap.
All exons must have a gene_type or gene_biotype column.
"""
input:
gtf = config["GTFFILE"]
output:
gtf = GTFFILE
container: config['images']['build_gtf']
script:
join(SCRIPTSDIR, "repair_gtf.py")



rule rsem:
"""
Builds reference files for rsem to quantify gene and isoform counts.
Expand Down Expand Up @@ -512,16 +520,18 @@ rule qualimapinfo:
gtf=GTFFILE,
output:
"qualimap_info.txt",
log:
stderr="qualimap_error.log",
params:
rname='bl:qualimapinfo',
generate_qualimap=join(SCRIPTSDIR, "generate_qualimap_ref.py"),
container: config['images']['build_rnaseq']
container: config['images']['build_gtf']
shell: """
python3 {params.generate_qualimap} \\
-g {input.gtf} \\
-f {input.fa} \\
-o {output} \\
--ignore-strange-chrom 2> qualimap_error.log
--ignore-strange-chrom 2> {log.stderr}
"""


Expand Down
23 changes: 15 additions & 8 deletions workflow/scripts/builder/generate_qualimap_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,24 +53,31 @@ def write_qualimap_info(args):
open_func = get_open(gtfFileName)
with open_func(gtfFileName) as gtf_file:
gtf_file = HTSeq.GFF_Reader(gtf_file)
features = collections.defaultdict(list)
exons_dict = collections.defaultdict(list)
other_features = []
for feature in gtf_file:
if feature.type == "exon":
geneName = feature.attr["gene_id"]
features[geneName].append(feature)

exons_dict[geneName].append(feature)
outFile = open(outFileName, "w")
outFile.write(f'"biotypes"\t"length"\t"gc"\n')

for geneId, exons in features.items():
for geneId, exons in exons_dict.items():
print("Processing %s" % geneId)

if len(exons) == 0:
continue
try:
biotype = exons[0].attr["gene_type"]
except KeyError:
biotype = exons[0].attr["gene_biotype"]

biotypes = ["gene_type", "gene_biotype"]
biotype = None
for biotype_str in biotypes:
if biotype_str in exons[0].attr:
biotype = exons[0].attr[biotype_str]
break
if not biotype:
raise ValueError(
f"No biotype found for exon {exons[0].attr['transcript_id']}. Exons must have at least one of these attributes: {', '.join(biotypes)}"
)

# build transcripts dictionary
transcripts = {}
Expand Down
102 changes: 102 additions & 0 deletions workflow/scripts/builder/repair_gtf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env python
import sys
import collections
import gzip
import HTSeq
import functools
import sys


def get_open_func(filename):
return functools.partial(gzip.open, mode="rt") if filename.endswith(".gz") else open


def retrieve_header(in_filename):
open_func = get_open_func(in_filename)
with open_func(in_filename) as file_handle:
header = list()
for line in file_handle:
if line.startswith("#"):
header.append(line)
else:
break
return header


def build_genes_dict(in_filename):
"""
Create a dictionary mapping gene IDs to feature objects from HTSeq.GFF_Reader
"""
open_func = get_open_func(in_filename)
with open_func(in_filename) as gtf_file:
gtf_file = HTSeq.GFF_Reader(gtf_file)
genes_dict = {}
for feature in gtf_file:
if feature.type == "gene":
gene_id = feature.attr["gene_id"]
if gene_id in genes_dict:
raise KeyError(f"Duplicate gene ID found: {gene_id}")
genes_dict[gene_id] = feature
return genes_dict


def fix_exon_attrs(exon_feature, genes_dict, debug=True):
"""
If exon_feature does not have 'gene_type' or 'gene_biotype', retrieve them from the gene feature
"""
biotypes = ["gene_type", "gene_biotype"]
if not any(key in exon_feature.attr for key in biotypes):
debug_msg = f"Exon {exon_feature.attr['gene_id']} missing 'gene_type' or 'gene_biotype'. Adding "
gene = genes_dict[exon_feature.attr["gene_id"]]
if "gene_type" in gene.attr:
exon_feature.attr["gene_type"] = gene.attr["gene_type"]
debug_msg += f"'gene_type'={gene.attr['gene_type']} "
if "gene_biotype" in gene.attr:
exon_feature.attr["gene_biotype"] = gene.attr["gene_biotype"]
debug_msg += f"'gene_biotype'={gene.attr['gene_biotype']} "
if not any(key in gene.attr for key in biotypes):
raise KeyError(
f"Gene {exon_feature.attr['gene_id']} does not have 'gene_type' or 'gene_biotype'"
)
if debug:
print(debug_msg)
return exon_feature


def repair_gtf(in_filename, out_filename, debug=False):
# build a dictionary of gene features
genes_dict = build_genes_dict(in_filename)

# iterate over features and add gene_type/biotype if needed
new_gtf_lines = retrieve_header(in_filename)
open_func = get_open_func(in_filename)
with open_func(in_filename) as gtf_file:
gtf_file = HTSeq.GFF_Reader(gtf_file)
for feature in gtf_file:
if feature.type == "exon":
feature = fix_exon_attrs(feature, genes_dict, debug=debug)
new_gtf_lines.append(feature.get_gff_line())

# write the new GTF file
with open(out_filename, "w") as out_file:
out_file.writelines(new_gtf_lines)


if __name__ == "__main__":
if "snakemake" in locals() or "snakemake" in globals():
in_filename = snakemake.input.gtf
out_filename = snakemake.output.gtf
else:
in_filename = sys.argv[1]
out_filename = sys.argv[2]
repair_gtf(in_filename, out_filename)

test_cmd = """
/data/CCBR_Pipeliner/Pipelines/RENEE/renee-dev-sovacool/bin/renee build \
--sif-cache /data/CCBR_Pipeliner/SIFs \
--ref-fa /data/CCBR_Pipeliner/db/PipeDB/GDC_refs/downloads/GRCh38.d1.vd1.fa \
--ref-name hg38 \
--ref-gtf /data/CCBR_Pipeliner/db/PipeDB/Indices/GTFs/hg38/gencode.v45.primary_assembly.annotation.gtf \
--gtf-ver v45-test \
--output /data/CCBR_Pipeliner/db/PipeDB/Indices/test
"""

0 comments on commit 44057f4

Please sign in to comment.