Skip to content

Commit

Permalink
edit genebed generation to function with v111 nomenclature
Browse files Browse the repository at this point in the history
  • Loading branch information
pintoa1-mskcc committed Nov 7, 2024
1 parent a70828d commit 612a80a
Show file tree
Hide file tree
Showing 4 changed files with 291 additions and 25 deletions.
122 changes: 122 additions & 0 deletions bin/final_generate_v111_gene_bed.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/local/bin/Rscript

# __author__ = "Alexandria Dymun"
# __email__ = "[email protected]"
# __contributor__ = "Anne Marie Noronha ([email protected])"
# __version__ = "0.0.1"
# __status__ = "Dev"


suppressPackageStartupMessages({
library(plyr)
library(dplyr)
library(data.table)
library(stringr)
options(scipen = 999)
})

usage <- function() {
message("Usage:")
message("final_generate_v111_gene_bed.R <in.gff> <out.bed>")
}

args = commandArgs(TRUE)

if (length(args)!=2) {
usage()
quit()
}

gtf <- rtracklayer::import(args[1])
gtf_df <- as.data.frame(gtf)
#remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished)
gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),]

file.to_write <- args[2]

### convert start to 0-based to match metafusion expectations of gff format
gtf_df <- gtf_df %>%
rename(
chr = seqnames
) %>%
select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>%
filter(type %in% c("exon","intron","five_prime_utr","three_prime_utr","CDS")) %>%
mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1)


#START CLOCK
ptm <- proc.time()
print(ptm)

# Index each transcript feature, incrementing when an intron is passed
## metafusion expects exon count 0 to (N(exons)-1)
## Forward strand: Exon 0 == Exon 1
### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT

print(dim(gtf_df))
print(length(unique(gtf_df$transcript_id)))

modify_transcript <- function(transcript){

# Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another
if ("CDS" %in% transcript$type){
transcript <- transcript[!transcript$type == "exon",]
}
# Order features by increasing bp
transcript <- transcript[order(transcript$start, decreasing = FALSE),]
# Index features
idx <- 0
for (i in 1:nrow(transcript)){
transcript$idx[i]<- idx
if (transcript$type[i] == "intron"){
idx <- idx + 1
}
}
# REFORMAT TRANSCRIPT
#Change strand info (+ --> f, - --> r)
if (unique(transcript$strand) == "+"){
transcript$strand <- 'f'
} else if (unique(transcript$strand) == "-"){
transcript$strand <- 'r'
} else {
errorCondition("Strand info for this transcript is inconsistent")
}
#Add "chr" prefix to chromosomes
transcript$chr <- sapply("chr", paste0, transcript$chr)
#Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later
transcript <- transcript %>% mutate(type = as.character(type))
transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type))
transcript$type[transcript$type == "five_prime_utr"] <- "utr5"
transcript$type[transcript$type == "three_prime_utr"] <- "utr3"


#### Any exon that remains after the cds change, is likely and untranslated region. change below
# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)
#Forward strand
transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5"
#Reverse strand
transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3"
expected_types <- c("cds","intron","utr3","utr5")
transcript <- transcript[transcript$type %in% c(expected_types),]
return(transcript)
}

if(file.exists(file.to_write) ) {file.remove(file.to_write)}

gtf_df_modified <- gtf_df %>%
group_by(transcript_id,.drop = FALSE) %>%
group_modify(~ modify_transcript(.x)) %>%
select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>%
arrange(chr,start,end)

time <- proc.time() - ptm
print(time)

write.table(
gtf_df_modified,
file.to_write,
sep="\t",
quote=F,
row.names=F,
col.names=F
)
69 changes: 46 additions & 23 deletions modules/local/metafusion/genebed/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@ process METAFUSION_GENEBED {

input:
tuple val(meta), path(gff)
val ensembl_version

output:
tuple val(meta), path("*.metafusion.gene.bed"), emit: metafusion_gene_bed
tuple val(meta), path("${meta.id}.metafusion.gene.bed"), emit: metafusion_gene_bed
path "versions.yml" , emit: versions

when:
Expand All @@ -21,28 +20,52 @@ process METAFUSION_GENEBED {
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
final_generate_v75_gene_bed.R \\
$gff \\
${ensembl_version}.metafusion.gene.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
R: \$(R --version | head -n1)
final_generate_v75_gene_bed.R: 0.0.2
END_VERSIONS
"""
if( prefix == 'GRCh37' )
"""
final_generate_v75_gene_bed.R \\
$gff \\
${prefix}.metafusion.gene.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
R: \$(R --version | head -n1)
final_generate_v75_gene_bed.R: 0.0.2
END_VERSIONS
"""
else if( prefix == 'GRCh38' )
"""
final_generate_v111_gene_bed.R \\
$gff \\
${prefix}.metafusion.gene.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
R: \$(R --version | head -n1)
final_generate_v111_gene_bed.R: 0.0.1
END_VERSIONS
"""

stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
touch ${prefix}.metafusion.gene.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
R: \$(R --version | head -n1)
final_generate_v75_gene_bed.R: 0.0.2
END_VERSIONS
"""
}
if( prefix == 'GRCh37' )
"""
touch ${prefix}.metafusion.gene.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
R: \$(R --version | head -n1)
final_generate_v75_gene_bed.R: 0.0.2
END_VERSIONS
"""
else if( prefix == 'GRCh38' )
"""
touch ${prefix}.metafusion.gene.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
R: \$(R --version | head -n1)
final_generate_v111_gene_bed.R: 0.0.1
END_VERSIONS
"""
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/local/bin/Rscript

# __author__ = "Alexandria Dymun"
# __email__ = "[email protected]"
# __contributor__ = "Anne Marie Noronha ([email protected])"
# __version__ = "0.0.1"
# __status__ = "Dev"


suppressPackageStartupMessages({
library(plyr)
library(dplyr)
library(data.table)
library(stringr)
options(scipen = 999)
})

usage <- function() {
message("Usage:")
message("final_generate_v111_gene_bed.R <in.gff> <out.bed>")
}

args = commandArgs(TRUE)

if (length(args)!=2) {
usage()
quit()
}

gtf <- rtracklayer::import(args[1])
gtf_df <- as.data.frame(gtf)
#remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished)
gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),]

file.to_write <- args[2]

### convert start to 0-based to match metafusion expectations of gff format
gtf_df <- gtf_df %>%
rename(
chr = seqnames
) %>%
select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>%
filter(type %in% c("exon","intron","five_prime_utr","three_prime_utr","CDS")) %>%
mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1)


#START CLOCK
ptm <- proc.time()
print(ptm)

# Index each transcript feature, incrementing when an intron is passed
## metafusion expects exon count 0 to (N(exons)-1)
## Forward strand: Exon 0 == Exon 1
### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT

print(dim(gtf_df))
print(length(unique(gtf_df$transcript_id)))

modify_transcript <- function(transcript){

# Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another
if ("CDS" %in% transcript$type){
transcript <- transcript[!transcript$type == "exon",]
}
# Order features by increasing bp
transcript <- transcript[order(transcript$start, decreasing = FALSE),]
# Index features
idx <- 0
for (i in 1:nrow(transcript)){
transcript$idx[i]<- idx
if (transcript$type[i] == "intron"){
idx <- idx + 1
}
}
# REFORMAT TRANSCRIPT
#Change strand info (+ --> f, - --> r)
if (unique(transcript$strand) == "+"){
transcript$strand <- 'f'
} else if (unique(transcript$strand) == "-"){
transcript$strand <- 'r'
} else {
errorCondition("Strand info for this transcript is inconsistent")
}
#Add "chr" prefix to chromosomes
transcript$chr <- sapply("chr", paste0, transcript$chr)
#Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later
transcript <- transcript %>% mutate(type = as.character(type))
transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type))
transcript$type[transcript$type == "five_prime_utr"] <- "utr5"
transcript$type[transcript$type == "three_prime_utr"] <- "utr3"
}

#### Any exon that remains after the cds change, is likely and untranslated region. change below
# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)
#Forward strand
transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5"
#Reverse strand
transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3"
expected_types <- c("cds","intron","utr3","utr5")
transcript <- transcript[transcript$type %in% c(expected_types),]
return(transcript)
}

if(file.exists(file.to_write) ) {file.remove(file.to_write)}

gtf_df_modified <- gtf_df %>%
group_by(transcript_id,.drop = FALSE) %>%
group_modify(~ modify_transcript(.x)) %>%
select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>%
arrange(chr,start,end)

time <- proc.time() - ptm
print(time)

write.table(
gtf_df_modified,
file.to_write,
sep="\t",
quote=F,
row.names=F,
col.names=F
)
3 changes: 1 addition & 2 deletions subworkflows/local/prepare_references.nf
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,7 @@ workflow PREPARE_REFERENCES {
)

METAFUSION_GENEBED(
AGAT_SPADDINTRONS.out.gff,
params.ensembl_version
AGAT_SPADDINTRONS.out.gff
)

METAFUSION_GENEINFO(
Expand Down

0 comments on commit 612a80a

Please sign in to comment.