diff --git a/.gitignore b/.gitignore index e69de29..408286c 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1,7 @@ +work/* +runs/* +logs/* +*log +.nextflow.log* +.nextflow* +results/* diff --git a/bin/Metafusion_forte.sh b/bin/Metafusion_forte.sh new file mode 100755 index 0000000..53874b6 --- /dev/null +++ b/bin/Metafusion_forte.sh @@ -0,0 +1,159 @@ +#!/bin/bash +#STEPS + +# __author__ = "Alexandria Dymun" +# __email__ = "pintoa1@mskcc.org" +# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" +# __version__ = "0.0.1" +# __status__ = "Dev" + + +output_ANC_RT_SG=1 +RT_call_filter=1 +blck_filter=1 +ANC_filter=1 +usage() { + echo "Usage: Metafusion_forte.sh --num_tools= --genome_fasta --recurrent_bedpe --outdir --cff --gene_bed --gene_info " 1>&2; + exit 1; +} + +# Loop through arguments and process them +while test $# -gt 0;do + case $1 in + -n=*|--num_tools=*) + num_tools="${1#*=}" + shift + ;; + --outdir) + outdir="$2" + shift 2 + ;; + --cff) + cff="$2" + shift 2 + ;; + --gene_bed) + gene_bed="$2" + shift 2 + ;; + --gene_info) + gene_info="$2" + shift 2 + ;; + --genome_fasta) + genome_fasta="$2" + shift 2 + ;; + --recurrent_bedpe) + recurrent_bedpe="$2" + shift 2 + ;; + *) + #OTHER_ARGUMENTS+=("$1") + shift # Remove generic argument from processing + ;; + esac +done + +if [[ ! $cff || ! $gene_info || ! 
$gene_bed ]]; then
+    echo "Missing required argument"
+    usage
+fi
+
+
+mkdir -p "$outdir"  # -p: do not error out when the output directory already exists
+
+#Check CFF file format:
+#Remove entries with nonconforming chromosome name
+
+all_gene_bed_chrs=`awk -F '\t' '{print $1}' $gene_bed | sort | uniq | sed 's/chr//g '`
+awk -F " " -v arr="${all_gene_bed_chrs[*]}" 'BEGIN{OFS = "\t"; split(arr,arr1); for(i in arr1) dict[arr1[i]]=""} $1 in dict && $4 in dict' $cff > $outdir/$(basename $cff).cleaned_chr
+grep -v -F -x -f $outdir/$(basename $cff).cleaned_chr $cff > problematic_chromosomes.cff  # -F -x: kept records are matched as exact whole lines, not as regular expressions
+cff=$outdir/$(basename $cff).cleaned_chr
+
+#Rename cff
+echo Rename cff
+rename_cff_file_genes.MetaFusion.py $cff $gene_info > $outdir/$(basename $cff).renamed
+cff=$outdir/$(basename $cff).renamed
+
+#Annotate cff
+if [ $genome_fasta ]; then
+    echo Annotate cff, extract sequence surrounding breakpoint
+    reann_cff_fusion.py --cff $cff --gene_bed $gene_bed --ref_fa $genome_fasta > $outdir/$(basename $cff).reann.WITH_SEQ
+else
+    echo Annotate cff, no extraction of sequence surrounding breakpoint
+    reann_cff_fusion.py --cff $cff --gene_bed $gene_bed > $outdir/$(basename $cff).reann.NO_SEQ
+fi
+
+# Assign .cff based on SEQ or NOSEQ
+if [ $genome_fasta ]; then
+    cff=$outdir/$(basename $cff).reann.WITH_SEQ
+    echo cff $cff
+else
+    cff=$outdir/$(basename $cff).reann.NO_SEQ
+    echo cff $cff
+fi
+
+echo Add adjacent exons to cff
+extract_closest_exons.py $cff $gene_bed $genome_fasta > $outdir/$(basename $cff).exons
+
+# assign cff as ".exons" if --annotate_exons flag was specified
+
+cff=$outdir/$(basename $cff).exons
+
+
+#Merge
+cluster=$outdir/$(basename $cff).cluster
+echo Merge cff by genes and breakpoints
+RUN_cluster_genes_breakpoints.sh $cff $outdir > $cluster
+
+#output ANC_RT_SG file
+if [ $output_ANC_RT_SG -eq 1 ]; then
+    echo output cis-sage.cluster file
+    output_ANC_RT_SG.py $cluster > $outdir/cis-sage.cluster
+fi
+
+#ReadThrough Callerfilter
+if [ $RT_call_filter -eq 1 ]; then
+    echo ReadThrough, callerfilter $num_tools
+    cat $cluster | grep 
ReadThrough > $outdir/$(basename $cluster).ReadThrough + callerfilter_num.py --cluster $cluster --num_tools $num_tools > $outdir/$(basename $cluster).callerfilter.$num_tools + callerfilter_excluded=$(comm -13 <(cut -f 22 $outdir/$(basename $cluster).callerfilter.$num_tools | sort | uniq) <(cut -f 22 $cluster | sort | uniq)) + grep -v ReadThrough $outdir/$(basename $cluster).callerfilter.$num_tools > $outdir/$(basename $cluster).RT_filter.callerfilter.$num_tools + cluster_RT_call=$outdir/$(basename $cluster).RT_filter.callerfilter.$num_tools +fi +# Blocklist Filter +if [ $recurrent_bedpe ]; then + echo blocklist filter + blocklist_filter_recurrent_breakpoints.sh $cff $cluster_RT_call $outdir $recurrent_bedpe > $outdir/$(basename $cluster).RT_filter.callerfilter.$num_tools.blck_filter + + blocklist_cluster=$outdir/$(basename $cluster_RT_call).BLOCKLIST + + cluster=$outdir/$(basename $cluster).RT_filter.callerfilter.$num_tools.blck_filter +fi +# Adjacent Noncoding filter +if [ $ANC_filter -eq 1 ]; then + echo ANC adjacent noncoding filter + filter_adjacent_noncoding.py $cluster > $outdir/$(basename $cluster).ANC_filter + + cluster=$outdir/$(basename $cluster).ANC_filter +fi +#Rank and generate final.cluster +echo Rank and generate final.cluster +rank_cluster_file.py $cluster > $outdir/final.n$num_tools.cluster +cluster=$outdir/final.n$num_tools.cluster +### Generate filtered FID file +#out=`awk -F '\t' '{print $15}' $cluster | tail -n +2` +#out2=`awk -F '\t' '{print $22}' $outdir/cis-sage.cluster | tail -n +2` +#out3=`echo $out $out2` +#echo ${out3//,/ } > out4 +#out5=`tr ' ' '\n' < out4 | sort | uniq` + +#for this in $(echo $out5); do grep $this $cff; done >> $outdir/$(basename $cff).filtered.cff + +rm -f filters.txt +cut -f 22 *.BLOCKLIST | tr "," "\n" | sort | uniq | sed "s/$/\tblocklist/g" > filters.txt +cut -f 22 *.ANC_filter | tr "," "\n" | sort | uniq | sed "s/$/\tadjacent_noncoding/g" >> filters.txt +cut -f 22 *.ReadThrough | tr "," "\n" | sort | uniq | sed 
"s/$/\tread_through/g" >> filters.txt +echo -en "$callerfilter_excluded" | tr "," "\n" | sort | uniq | sed "s/$/\tcaller_filter/g" >> filters.txt + diff --git a/bin/add_annotations_cff.R b/bin/add_annotations_cff.R new file mode 100755 index 0000000..7ec1489 --- /dev/null +++ b/bin/add_annotations_cff.R @@ -0,0 +1,109 @@ +#!/usr/local/bin/Rscript +# __author__ = "Anne Marie Noronha" +# __email__ = "noronhaa@mskcc.org" +# __version__ = "0.0.1" + + +suppressPackageStartupMessages({ + library(dplyr) + library(data.table) +}) + +usage <- function() { + message("Usage:") + message("add_annotations_cff.R --cff-file --agfusion-file --oncokb-file --out-prefix ") +} + +args = commandArgs(TRUE) + +if (is.null(args) | length(args)<1) { + usage() + quit() +} + +#' Parse out options from a string without recourse to optparse +#' +#' @param x Long-form argument list like --opt1 val1 --opt2 val2 +#' +#' @return named list of options and values similar to optparse + +parse_args <- function(x){ + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) gsub('-','_',x[1]))) + parsed_args[! 
is.na(parsed_args)] +} + +args_opt <- parse_args(paste(args,collapse=" ")) + +possible_args = c("cff", "oncokb", "agfusion", "out_prefix") +if (length(setdiff(names(args_opt),possible_args)) > 0){ + message("Invalid options") + usage() + quit() +} + +if (length(setdiff(possible_args,names(args_opt))) > 0) { + message("Missing required arguments") + usage() + quit() +} + +oncokb_file = args_opt$oncokb +agfusion_file = args_opt$agfusion +cff_file = args_opt$cff +out_prefix = args_opt$out_prefix + +cff = fread(cff_file) +final_cff_cols <- c(names(cff)) +agfusion_tab = fread(agfusion_file) %>% select(c(`5'_transcript`,`3'_transcript`,`5'_breakpoint`,`3'_breakpoint`,Fusion_effect)) +final_cff_cols <- c(final_cff_cols,"Fusion_effect") +if (!is.null(oncokb_file)){ + oncokb_tab = fread(oncokb_file) %>% select(-Fusion) + final_cff_cols = c(final_cff_cols,names(oncokb_tab %>% select(-Tumor_Sample_Barcode))) + cff <- merge( + cff, + oncokb_tab, + by.x ="FID", + by.y = "Tumor_Sample_Barcode", + all.x = T, + all.y=F + ) +} + +cff <- merge( + cff, + agfusion_tab, + by.x = c("gene5_transcript_id","gene3_transcript_id","gene5_breakpoint","gene3_breakpoint"), + by.y = c("5'_transcript","3'_transcript","5'_breakpoint","3'_breakpoint"), + all.x = T, + all.y = T +) + +cff <- as.data.frame(cff)[,c(final_cff_cols)] +#cff <- cff %>% mutate(!!final_cff_cols[34] := Fusion_effect) %>% select(-c(Fusion_effect)) + +write.table( + cff, + paste0(out_prefix, ".unfiltered.cff"), + row.names = F, + quote = F, + sep = "\t", + col.names = ! "V1" %in% final_cff_cols +) + +filtered_cff <- cff %>% filter(! (is.na(cluster) | is.null(cluster) | cluster == "")) +write.table( + filtered_cff, + paste0(out_prefix, ".final.cff"), + row.names = F, + append = F, + quote = F, + sep = "\t", + col.names = ! 
"V1" %in% final_cff_cols +) + + diff --git a/bin/add_flags_and_cluster_information.R b/bin/add_flags_and_cluster_information.R new file mode 100755 index 0000000..b74ff92 --- /dev/null +++ b/bin/add_flags_and_cluster_information.R @@ -0,0 +1,199 @@ +#!/usr/local/bin/Rscript +# __author__ = "Alexandria Dymun" +# __email__ = "pintoa1@mskcc.org" +# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" +# __version__ = "0.0.1" +# __status__ = "Dev" + + + +library(dplyr) +library(data.table) + args <- commandArgs(TRUE) + if (length(args) != 6) { + stop( + "6 arguments are required as input in the following order: unfiltered_cff cluster_file cis_sage_file problematic_chromosomes_file filters_table sample_name" + ) + } + + unfiltered_cff <- fread(args[1],data.table = F) + header_cff <- + c( + "gene5_chr", + "gene5_breakpoint", + "gene5_strand", + "gene3_chr", + "gene3_breakpoint", + "gene3_strand", + "library", + "sample", + "T_N", + "disease", + "tool", + "max_split_cnt", + "max_span_cnt", + "gene5_renamed_symbol", + "gene5_tool_annotation", + "gene3_renamed_symbol", + "gene3_tool_annotation", + "FusionType", + "reann_gene5_symbol", + "reann_gene5_region", + "reann_gene3_symbol", + "reann_gene3_region", + "reann_gene5_on_bndry", + "reann_gene5_close_to_bndry", + "reann_gene3_on_bndry", + "reann_gene3_close_to_bndry", + "score", + "coding_id_distance", + "gene_interval_distance", + "dnasupp", + "FID", + "gene5_seq", + "gene3_seq", + "is_inframe", + "closest_exon5", + "closest_exon3", + "captured_reads", + "gene5_transcript_id", + "gene3_transcript_id" + ) + colnames(unfiltered_cff) <- header_cff + cluster <- fread(args[2],data.table = F) + header_cluster <- + c( + "gene5_renamed_symbol", + "gene3_renamed_symbol", + "gene5_chr", + "gene5_breakpoint", + "gene3_chr", + "gene3_breakpoint", + "max_split_cnt", + "max_span_cnt", + "T_N", + "disease", + "tool", + "FusionType", + "sample", + "cancer_db_hits", + "FID" + ) + colnames(cluster) <- header_cluster + cluster <- 
cluster %>%
+        mutate(CID=as.character(row_number())) %>%  # cluster id = row number of the cluster file
+        select(c(FID,CID)) %>%
+        tidyr::separate_rows(FID,sep=",")  # one row per fusion id (FID)
+
+
+    sample_name <- args[6]
+
+    cis_sage <- tryCatch({fread(args[3],data.table = F)},warning = function(cond){return( NULL)})  # any fread warning (e.g. empty file) yields NULL
+    if(!is.null(cis_sage)){
+        header_cis <-
+            c(
+                "test",
+                "gene5_renamed_symbol",
+                "gene3_renamed_symbol",
+                "max_split_cnt",
+                "max_span_cnt",
+                "T_N",
+                "disease",
+                "tool",
+                "FusionType",
+                "reann_gene5_on_bndry",
+                "reann_gene5_close_to_bndry",
+                "reann_gene3_on_bndry",
+                "reann_gene3_close_to_bndry",
+                "is_inframe",
+                "sample",
+                "gene5_chr",
+                "gene5_breakpoint",
+                "gene3_chr",
+                "gene3_breakpoint",
+                "closest_exons5",
+                "closest_exons3",
+                "FID"
+            )
+
+        colnames(cis_sage) <- header_cis
+
+        cis_sage <- cis_sage %>%
+            mutate(CID=paste0("cis_sage_",row_number())) %>%
+            select(c(FID,CID)) %>%
+            tidyr::separate_rows(FID,sep=",")
+
+        cluster <- bind_rows(cluster, cis_sage)  # append cis-sage ids after the regular cluster ids
+    }
+
+    df_cluster <- cluster %>%  # built even without cis-sage rows so the "cluster" column written below always exists
+        group_by(FID) %>%
+        summarise(cluster=paste(CID,collapse=';'))
+    unfiltered_cff <- merge(unfiltered_cff, df_cluster, by = "FID", all.x = T)
+
+    unfiltered_cff$Metafusion_flag <- apply(unfiltered_cff, 1, function(row) {
+        if ((is.na(row["reann_gene5_symbol"]) ||
+             is.na(row["reann_gene3_symbol"])) && (row["gene5_renamed_symbol"] != "." ||
+             row["gene3_renamed_symbol"] != ".") ) {
+            return("Gene_or_loc_not_in_bed")
+        } else if ((row["gene3_renamed_symbol"] != row["reann_gene3_symbol"]) %in% TRUE ||  # %in% TRUE maps an NA comparison to FALSE instead of aborting the if()
+                   (row["gene5_renamed_symbol"] != row["reann_gene5_symbol"]) %in% TRUE) {
+            return("Annot_gene_different_from_renamed")
+        } else{
+            return(NA)
+        }
+    })
+
+    weird_chromosomes <- tryCatch({fread(args[4],data.table = F)},warning = function(cond){return( NULL)})
+    if(!is.null(weird_chromosomes)){
+        colnames(weird_chromosomes) <-
+            c(
+                "gene5_chr",
+                "gene5_breakpoint",
+                "gene5_strand",
+                "gene3_chr",
+                "gene3_breakpoint",
+                "gene3_strand",
+                "library",
+                "sample",
+                "T_N",
+                "disease",
+                "tool",
+                "max_split_cnt",
+                "max_span_cnt",
+                "gene5_renamed_symbol",
+                "gene5_tool_annotation",
+                "gene3_renamed_symbol",
+                "gene3_tool_annotation"
+            )
+        weird_chromosomes[, colnames(unfiltered_cff)[!colnames(unfiltered_cff) %in% colnames(weird_chromosomes)]] <- NA  # pad with the columns only present in unfiltered_cff so rbind lines up
+        weird_chromosomes$Metafusion_flag <- "Chromosome_not_in_bed"
+        unfiltered_cff <- rbind(unfiltered_cff,weird_chromosomes)
+    }
+
+
+    filters <- tryCatch(
+        {
+            fread(
+                args[5],
+                data.table = F,
+                col.names=c("FID","tmpflag")
+            ) %>% group_by(FID) %>% summarise(tmpflag=paste(tmpflag,collapse=','))  # comma-separate multiple filter flags of one FID
+        },
+        warning = function(cond){return( NULL)}
+    )
+    if (!is.null(filters)){
+        unfiltered_cff <- merge(unfiltered_cff,filters, by="FID", all.x = T, all.y = F) %>%
+            mutate(Metafusion_flag=ifelse(is.na(Metafusion_flag) | Metafusion_flag=="", tmpflag, ifelse(is.na(tmpflag), Metafusion_flag, paste(Metafusion_flag,tmpflag,sep=",")))) %>%  # never append a literal ",NA" when a row has no filter entry
+            select(-c(tmpflag))
+    }
+
+    write.table(
+        unfiltered_cff[,c(header_cff,"Metafusion_flag","cluster")],
+        paste0(sample_name, "_metafusion_cluster.unfiltered.cff"),
+        row.names = F,
+        append = F,
+        quote = F,
+        sep = "\t"
+    )
+
diff --git a/bin/final_generate_v75_gene_bed.R b/bin/final_generate_v75_gene_bed.R
new file mode 100644
index 0000000..57ecbe9
--- /dev/null
+++ b/bin/final_generate_v75_gene_bed.R
@@ -0,0 +1,113 @@
+
+# __author__ = "Alexandria Dymun"
+# __email__ 
= "pintoa1@mskcc.org" +# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" +# __version__ = "0.0.1" +# __status__ = "Dev" + + + +library(dplyr) +library(data.table) +library(stringr) +gtf <- rtracklayer::import('/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf') +gtf_df <- as.data.frame(gtf) + + +# Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75 +# Add introns to gtf, convert to gff3 +# bsub -R "rusage[mem=64]" -o add_introns_agat_%J.out singularity exec -B /juno/ \\ +# -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 \\ +# /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\ +# -o genes.INTRONS.gff3" +# gff2bed < genes.INTRONS.gff3 > genes.INTRONS.agat.bed + + +total.introns.bed <- fread(file="/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/genes.INTRONS.agat.bed", header = FALSE, stringsAsFactors = F, sep="\t", na.strings = "",data.table = F) +colnames(total.introns.bed) <- c("chr","start","end","gene_id","tmp","strand","gene_biotype","type","V9","description") +total.introns.bed$transcript_id <- gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"transcript_id=",n=2)[,2]) +total.introns.bed$gene_name <-gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"gene_name=",n=2)[,2]) + +transcript_ids <- unique(total.introns.bed$transcript_id) +file.to_write <- "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/cleaned_metafusion_v75_gene.bed" + +if(file.exists(file.to_write) ) {file.remove(file.to_write)} + +#START CLOCK: THE INDEXING TAKES A LONG TIME, LIKE 5 HOURS +ptm <- proc.time() + +# Index each transcript feature, incrementing when an intron is passed +## metafusion expects exon count 0 to (N(exons)-1) +## Forward strand: Exon 0 == Exon 1 +### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT 
+for (id in transcript_ids){
+  transcript <- total.introns.bed[total.introns.bed$transcript_id == id,]
+  # Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another
+  if ("CDS" %in% transcript$type){
+    transcript <- transcript[!transcript$type == "exon",]
+  }
+  # Order features by increasing bp
+  transcript <- transcript[order(transcript$start, decreasing = FALSE),]
+  # Index features
+  idx <- 0
+  for (i in seq_len(nrow(transcript))){
+    transcript$idx [i]<- idx
+    if (transcript$type[i] == "intron"){
+      idx <- idx + 1
+    }
+  }
+  # REFORMAT TRANSCRIPT
+  #Change strand info (+ --> f, - --> r)
+  if (unique(transcript$strand) == "+"){
+    transcript$strand <- 'f'
+  } else if (unique(transcript$strand) == "-"){
+    transcript$strand <- 'r'
+  } else {
+    stop("Strand info for transcript ", id, " is inconsistent")  # stop() signals the error; errorCondition() only built an unused object
+  }
+  #Add "chr" prefix to chromosomes
+  transcript$chr <- paste0("chr", transcript$chr)
+  #Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later
+  if ("CDS" %in% unique(transcript$type)){transcript[transcript$type == "CDS",]$type <- "cds"}
+  ## DETERMINING UTR3 and UTR5
+  ### INSTEAD OF START AND STOP, USE CDS LOCATIONS AND STRAND INFORMATION.....
+  if ("UTR" %in% unique(transcript$type)){
+    if( unique(transcript$strand) == "f"){
+      #Forward strand
+      start_coding <- min(transcript[transcript$type == "cds","start"])
+      stop_coding <- max(transcript[transcript$type == "cds","end"])
+      transcript$type[transcript$end <= start_coding & transcript$type == "UTR"] <- "utr5"
+      transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr3"
+    }else {
+      start_coding <- max(transcript[transcript$type == "cds","end"])
+      stop_coding <- min(transcript[transcript$type == "cds","start"])
+      transcript$type[transcript$end <= start_coding & transcript$type == "UTR"] <- "utr3"
+      transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5"
+    }
+  }
+  transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )]
+  write.table(transcript, file.to_write, append=TRUE, sep="\t", quote=F, row.names=F, col.names=F)
+}
+
+time <- proc.time() - ptm
+time
+#
+# user system elapsed
+# 16657.116 32.227 16741.382
+
+
+new.bed <- fread(file.to_write,data.table = F)
+colnames(new.bed) <- c("chr","start","end","transcript_id","type","idx","strand","gene_name","gene_id")
+
+#### Any exon that remains after the cds change is likely an untranslated region. Change below
+
+# Basically, subfeatures which are "exon" need to be changed (i.e. 
exon --> utr3/utr5) +#Forward strand +new.bed$type[new.bed$strand == "f" & new.bed$type == "exon" ] <- "utr5" +#Reverse strand +new.bed$type[new.bed$strand == "r" & new.bed$type == "exon"]<- "utr3" + +expected_types <- c("cds","intron","utr3","utr5") +new.bed.ready <- new.bed[new.bed$type %in% c(expected_types),] + +write.table(new.bed.ready, "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/v75_gene.bed", sep="\t", quote=F, row.names=F, col.names=F) diff --git a/bin/make_cff_from_forte.R b/bin/make_cff_from_forte.R new file mode 100755 index 0000000..f000bac --- /dev/null +++ b/bin/make_cff_from_forte.R @@ -0,0 +1,176 @@ +#!/usr/local/bin/Rscript + +# __author__ = "Alexandria Dymun" +# __email__ = "pintoa1@mskcc.org" +# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" +# __version__ = "0.0.1" +# __status__ = "Dev" + + +library(dplyr) +library(data.table) +library(stringr) +cff_format <- + c( + "chr1", + "pos1", + "strand1", + "chr2", + "pos2", + "strand2", + "library", + "sample_name", + "sample_type", + "disease", + "tool", + 'split_cnt', + "span_cnt", + "t_gene1", + "t_area1", + "t_gene2", + "t_area2" + ) + + +opt <- commandArgs(TRUE) + +cff_format_df <- + setNames(data.frame(matrix( + ncol = length(cff_format), nrow = 0 + )), cff_format) + +make_arriba <- function(sample_file) { + df <- as.data.frame(matrix(ncol = 0, nrow = nrow(sample_file))) + + df$t_gene1 <- sample_file$gene_id1 + df$t_gene2 <- sample_file$gene_id2 + df$chr1 <- str_split_fixed(sample_file$breakpoint1, ":", 2)[, 1] + df$pos1 <- str_split_fixed(sample_file$breakpoint1, ":", 2)[, 2] + df$chr2 <- str_split_fixed(sample_file$breakpoint2, ":", 2)[, 1] + df$pos2 <- str_split_fixed(sample_file$breakpoint2, ":", 2)[, 2] + df$strand1 <- + str_split_fixed(sample_file$`strand1(gene/fusion)`, "/", 2)[, 1] + df$strand2 <- + str_split_fixed(sample_file$`strand2(gene/fusion)`, "/", 2)[, 1] + df$tool <- "arriba" + df$split_cnt <- + ifelse(!is.na(sample_file$split_reads1), + 
sample_file$split_reads1, + -1) + df$span_cnt <- + ifelse(!is.na(sample_file$discordant_mates), + sample_file$discordant_mates, + -1) + df$t_area1 <- sample_file$site1 + df$t_area2 <- sample_file$site2 + + return(df) +} + +make_fusioncatcher <- function(sample_file) { + df <- as.data.frame(matrix(ncol = 0, nrow = nrow(sample_file))) + + df$t_gene1 <- sample_file[, "Gene_1_id(5end_fusion_partner)"] + df$t_gene2 <- sample_file[, "Gene_2_id(3end_fusion_partner)"] + df$chr1 <- + str_split_fixed(sample_file[, "Fusion_point_for_gene_1(5end_fusion_partner)"], ":", 3)[, 1] + df$pos1 <- + str_split_fixed(sample_file[, "Fusion_point_for_gene_1(5end_fusion_partner)"], ":", 3)[, 2] + df$chr2 <- + str_split_fixed(sample_file[, "Fusion_point_for_gene_2(3end_fusion_partner)"], ":", 3)[, 1] + df$pos2 <- + str_split_fixed(sample_file[, "Fusion_point_for_gene_2(3end_fusion_partner)"], ":", 3)[, 2] + df$strand1 <- + str_split_fixed(sample_file[, "Fusion_point_for_gene_1(5end_fusion_partner)"], ":", 3)[, 3] + df$strand2 <- + str_split_fixed(sample_file[, "Fusion_point_for_gene_2(3end_fusion_partner)"], ":", 3)[, 3] + df$tool <- "fusioncatcher" + df$split_cnt <- + ifelse( + !is.na(sample_file$Spanning_unique_reads), + sample_file$Spanning_unique_reads, + -1 + ) + df$span_cnt <- + ifelse(!is.na(sample_file$Spanning_pairs), + sample_file$Spanning_pairs, + -1) + df$t_area1 <- sample_file$Predicted_effect + df$t_area2 <- sample_file$Predicted_effect + + + return(df) + +} + +make_starfusion <- function(sample_file) { + df <- as.data.frame(matrix(ncol = 0, nrow = nrow(sample_file))) + df$t_gene1 <- str_split_fixed(sample_file$LeftGene, "\\^", 2)[, 2] + df$t_gene2 <- str_split_fixed(sample_file$RightGene, "\\^", 2)[, 2] + df$chr1 <- + str_replace(str_split_fixed(sample_file$LeftBreakpoint, ":", 3)[, 1], + "chr", + "") + df$pos1 <- str_split_fixed(sample_file$LeftBreakpoint, ":", 3)[, 2] + df$chr2 <- + str_replace(str_split_fixed(sample_file$RightBreakpoint, ":", 3)[, 1], + "chr", + "") + 
df$pos2 <- str_split_fixed(sample_file$RightBreakpoint, ":", 3)[, 2] + df$strand1 <- + str_split_fixed(sample_file$LeftBreakpoint, ":", 3)[, 3] + df$strand2 <- + str_split_fixed(sample_file$RightBreakpoint, ":", 3)[, 3] + df$tool <- "starfusion" + df$split_cnt <- + ifelse(!is.na(sample_file$JunctionReadCount), + sample_file$JunctionReadCount, + -1) + df$span_cnt <- + ifelse(!is.na(sample_file$SpanningFragCount), + sample_file$SpanningFragCount, + -1) + df$t_area1 <- sample_file$SpliceType + df$t_area2 <- sample_file$SpliceType + + return(df) + +} + +sample_file <- fread(opt[2], data.table = F) + +tool_cff <- + setNames(data.frame(matrix( + ncol = length(cff_format), nrow = 0 + )), cff_format) +if (opt[1] == "arriba") { + if (nrow(sample_file) > 0) { + tool_cff <- make_arriba(sample_file) + } +} else if (opt[1] == "fusioncatcher") { + if (nrow(sample_file) > 0) { + tool_cff <- make_fusioncatcher(sample_file) + } +} else if (opt[1] == "starfusion") { + if (nrow(sample_file) > 0) { + tool_cff <- make_starfusion(sample_file) + } +} + +if (nrow(tool_cff) > 0) { + tool_cff$sample_name <- opt[3] + tool_cff$library <- "RNA" + tool_cff$sample_type <- "Tumor" + tool_cff$disease <- NA +} +tool_cff$strand1[tool_cff$strand1 == "."] <- NA +tool_cff$strand2[tool_cff$strand2 == "."] <- NA +tool_cff <- tool_cff[, cff_format] +write.table( + tool_cff, + opt[4], + sep = "\t", + row.names = F, + quote = F, + col.names = F +) diff --git a/bin/make_gene_info_for_forte.R b/bin/make_gene_info_for_forte.R new file mode 100644 index 0000000..1ef403c --- /dev/null +++ b/bin/make_gene_info_for_forte.R @@ -0,0 +1,83 @@ + +# __author__ = "Alexandria Dymun" +# __email__ = "pintoa1@mskcc.org" +# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" +# __version__ = "0.0.1" +# __status__ = "Dev" + + + +library(dplyr) +library(stringr) +library(argparse) + +opt = commandArgs(TRUE) + +parser=ArgumentParser() +parser$add_argument("-p",'--primary_gtf',type="character",default = NULL,help = 
"Primary GTF, should match your bed file and arriba. Assumes ARRIBA is on primary gtf") +parser$add_argument("-c",'--fc_custom_bed_gene_names',type="character",default = NULL,help = "Fusioncatcher custom genes bed file") +parser$add_argument("-s",'--star_fusion_ref',type="character",default = NULL,help = "StarFusion GTF") +parser$add_argument("-f",'--fusioncatcher_ref',type="character",default = NULL,help = "Fusioncatcher GTF") +parser$add_argument("-o",'--outputDir',type="character",default = NULL,help = "outputDirectory to write gene_info and excess gene list") + +opt=parser$parse_args() + + +### primary gtf is v75, also used in arriba +primary_gtf <- as.data.frame(rtracklayer::import(opt$primary_gtf)) + +### Fusion catcher has custom gene names/gene_ids.... +## https://github.com/ndaniel/fusioncatcher/blob/ebc46fd1a8046fc909a56e09944a2ec2d69cc808/bin/add_custom_gene.py#L704-L715 +fc_custom_bed_gene_names <- read.table(opt$fc_custom_bed_gene_names) +fc_custom_bed_gene_names$gene_name <- str_split_fixed(fc_custom_bed_gene_names$V4,"-",n=2)[,1] +fc_custom_bed_gene_names$gene_id <- str_split_fixed(fc_custom_bed_gene_names$V4,"-",n=3)[,2] +star_fusion_ref <- as.data.frame(rtracklayer::import(opt$star_fusion_ref)) +fusioncatcher_ref <- as.data.frame(rtracklayer::import(opt$fusioncatcher_ref)) + + +all_my_gene_ids_and_names <- list(primary_gtf,fc_custom_bed_gene_names,star_fusion_ref,fusioncatcher_ref) +### whichever gtf you label as primary should also be the same version of the gene_bed file generated for metafusion +names(all_my_gene_ids_and_names) <- c("primary","one","two","three") +unique_id_to_names <- lapply( all_my_gene_ids_and_names,function(gtf) { + ### If gene id has versions, strip them off + if(all(grepl("\\.",gtf$gene_id))){ + gtf$gene_id_with_version <- gtf$gene_id + gtf$gene_id <- str_split_fixed(gtf$gene_id_with_version,"\\.",n=2)[,1] + return(unique(gtf[,c("gene_name","gene_id","gene_id_with_version")])) + } else{ + 
return(unique(gtf[,c("gene_name","gene_id")]))
+    }
+})
+
+
+gene_info <- unique_id_to_names$primary
+### tack on missing gene_ids from other references to gene info
+versioned_gtf <-unlist(sapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){
+    if(any(colnames(unique_id_to_names[[name]]) == "gene_id_with_version")){
+        return(name)
+    }
+}))
+
+
+add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){
+    add_symbols_and_ids <- unique_id_to_names[[name]]
+    add_symbols_and_ids <- add_symbols_and_ids[!add_symbols_and_ids$gene_id %in% gene_info$gene_id,]
+    if(name %in% versioned_gtf){
+        add_symbols_and_ids <-add_symbols_and_ids[,c("gene_name","gene_id_with_version")]
+        colnames(add_symbols_and_ids) <- c("gene_name","gene_id")
+    }
+    return(add_symbols_and_ids)
+
+}))
+# Excess genes being added (genes will be flagged as gene not in v75)
+gene_info <- rbind(gene_info,add_these_exess_gene_ids)
+
+gene_info <- merge(gene_info,do.call(rbind,unique_id_to_names[versioned_gtf])[,c("gene_id","gene_id_with_version")],by = "gene_id",all.x = T, all.y = F)
+
+gene_info$Synonyms <- ifelse(is.na(gene_info$gene_id_with_version),gene_info$gene_id,paste0(gene_info$gene_id,"|",gene_info$gene_id_with_version))
+gene_info$Symbol <- gene_info$gene_name
+
+gene_info <- gene_info[,c("Symbol","Synonyms")]
+
+write.table(gene_info,paste0(opt$outputDir,"/gene.info"),sep ="\t",quote = F,row.names = F)
+write.table(add_these_exess_gene_ids,paste0(opt$outputDir,"/excess_gene_ids.txt"),sep ="\t",quote = F,row.names = F)  # paste0() is now closed before the write.table options
diff --git a/conf/igenomes.config b/conf/igenomes.config
index d734a1b..a85e19d 100644
--- a/conf/igenomes.config
+++ b/conf/igenomes.config
@@ -33,6 +33,10 @@ params {
                 baits = "${params.targets_base}/${params.genome}/baits/agilent_v4/Agilent_v4_51MB_Human_b37_baits.bed"
             }
         }
+        metafusion_blocklist = 
"https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/blocklist_breakpoints.bedpe.gz" + metafusion_gene_bed = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/v75_gene.bed.gz" + metafusion_gene_info = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/gene_info_20230714.txt" + ensembl_version = 75 } 'GRCh38' { fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" @@ -52,7 +56,12 @@ params { arriba_blacklist = "/usr/local/var/lib/arriba/blacklist_hg19_hs37d5_GRCh37_v2.3.0.tsv.gz" arriba_known_fusions = "/usr/local/var/lib/arriba/known_fusions_hg19_hs37d5_GRCh37_v2.3.0.tsv.gz" arriba_protein_domains = "/usr/local/var/lib/arriba/protein_domains_hg19_hs37d5_GRCh37_v2.3.0.gff3" - cdna = "https://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.75.cdna.all.fa.gz" + cdna = "http://ftp.ensemblgenomes.org/pub/viruses/fasta/sars_cov_2/cdna/Sars_cov_2.ASM985889v3.cdna.all.fa.gz" + metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/blocklist_breakpoints.bedpe" + metafusion_gene_bed = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/v75_gene.bed" + metafusion_gene_info = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/gene_info_20230714.txt" + ensembl_version = 75 + } /* 'hg38' { diff --git a/conf/modules.config b/conf/modules.config index 0927b20..2be37bf 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -171,22 +171,39 @@ process { ext.args = "--cosmic_usr ${params.cosmic_usr} --cosmic_passwd \$COSMIC_PASSWD" } - withName: FUSIONREPORT { - ext.when = ! 
params.skip_fusion_report - ext.args = { - [ - "-t ${params.fusion_report_cutoff}", - '--allow-multiple-gene-symbols', - '--export csv' - ].join(' ').trim() - } + withName: MERGE_CFF { + publishDir = [ + enabled: false + ] + } + + withName: METAFUSION { + ext.args = "--num_tools=${params.fusion_tool_cutoff}" + publishDir = [ + path: { "$params.outdir/analysis/${meta.id}/metafusion/intermediates" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + + ] + } + + withName: ADD_FLAG { + publishDir = [ + path: { "$params.outdir/analysis/${meta.id}/metafusion/intermediates" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } - withName: CSV_TO_TSV { + withName: AGFUSION_BATCH { + ext.args = "-a cff_transcript --middlestar" publishDir = [ - path: { "${params.outdir}/analysis/${meta.id}/fusionreport" }, + path: { "${params.outdir}/analysis/${meta.id}/agfusion" }, mode: params.publish_dir_mode, - pattern: '*.tsv' + pattern: 'fusion_transcripts.csv', + enabled: false ] } @@ -194,6 +211,22 @@ process { ext.when = params.run_oncokb_fusionannotator secret = ["ONCOKB_TOKEN"] ext.args = "-b \$ONCOKB_TOKEN" + publishDir = [ + path: { "${params.outdir}/analysis/${meta.id}/oncokb" }, + mode: params.publish_dir_mode, + pattern: '*.oncokb.tsv', + enabled: false + ] + } + + withName: CFF_FINALIZE { + ext.prefix = { "${meta.id}" } + publishDir = [ + path: { "${params.outdir}/analysis/${meta.id}/metafusion" }, + mode: params.publish_dir_mode, + pattern: '*.{final,unfiltered}.cff', + enabled: true + ] } withName: STAR_FOR_ARRIBA { diff --git a/main.nf b/main.nf index 8a4750e..5dfe5f6 100644 --- a/main.nf +++ b/main.nf @@ -24,6 +24,10 @@ params.cdna = WorkflowMain.getGenomeAttribute(params, 'cdna') params.arriba_blacklist = WorkflowMain.getGenomeAttribute(params, 'arriba_blacklist') params.arriba_known_fusions = 
WorkflowMain.getGenomeAttribute(params, 'arriba_known_fusions') params.arriba_protein_domains = WorkflowMain.getGenomeAttribute(params, 'arriba_protein_domains') +params.metafusion_blocklist = WorkflowMain.getGenomeAttribute(params, 'metafusion_blocklist') +params.metafusion_gene_bed = WorkflowMain.getGenomeAttribute(params, 'metafusion_gene_bed') +params.metafusion_gene_info = WorkflowMain.getGenomeAttribute(params, 'metafusion_gene_info') +params.ensembl_version = WorkflowMain.getGenomeAttribute(params, 'ensembl_version') WorkflowMain.initialise(workflow, params, log) diff --git a/modules.json b/modules.json index 2c0b137..9228792 100644 --- a/modules.json +++ b/modules.json @@ -10,12 +10,12 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, - "cat/fastq": { + "cat/cat": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, - "csvtk/concat": { + "cat/fastq": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] diff --git a/modules/local/add_flags/main.nf b/modules/local/add_flags/main.nf new file mode 100644 index 0000000..1e05dbc --- /dev/null +++ b/modules/local/add_flags/main.nf @@ -0,0 +1,37 @@ +process ADD_FLAG { + tag "$meta.id" + label "process_single" + +/// must be using singularity 3.7+ + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'ghcr.io/rocker-org/devcontainer/tidyverse:4' : + 'ghcr.io/rocker-org/devcontainer/tidyverse:4' }" + + input: + tuple val(meta), path(cluster), path(cis), path(cff), path(problem_chrom), path(filters) + + output: + tuple val(meta), path("*_metafusion_cluster.unfiltered.cff"), emit: unfiltered_cff + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def sample = "${meta.sample}" + """ + add_flags_and_cluster_information.R \\ + $cff \\ + $cluster \\ + $cis \\ + $problem_chrom \\ + $filters \\ + $sample + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + add_flags_and_cluster_information.R: 0.0.1 + END_VERSIONS + """ +} diff --git a/modules/local/agfusion/batch/main.nf b/modules/local/agfusion/batch/main.nf new file mode 100644 index 0000000..c4dd105 --- /dev/null +++ b/modules/local/agfusion/batch/main.nf @@ -0,0 +1,47 @@ +process AGFUSION_BATCH { + tag "$meta.id" + label 'process_low' + + // Note: 2.7X indices incompatible with AWS iGenomes. + conda 'bioconda::agfusion=1.252' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'cmopipeline/agfusion:0.0.5' : + 'cmopipeline/agfusion:0.0.5' }" + + input: + tuple val(meta), path(fusions) + path(agfusion_db) + path(pyensembl_cache) + + output: + tuple val(meta), path("${prefix}") , emit: fusions_annotated + tuple val(meta), path("${prefix}.fusion_transcripts.csv"), emit: fusion_transcripts_csv + tuple val(meta), path("${prefix}.fusion_transcripts.tsv"), emit: fusion_transcripts_tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + export PYENSEMBL_CACHE_DIR=\$PWD/${pyensembl_cache} + + awk -F"\\t" 'NR == 1 && \$1 ~ /gene5_chr/ {next;}{print}' ${fusions} > ${fusions}.no_header + + agfusion batch \\ + -f ${fusions}.no_header \\ + -db ${agfusion_db} \\ + -o ${prefix} \\ + ${args} + + awk -F"," 'NR != 1 && FNR == 1 {next;}{print}' ${prefix}/*/*.fusion_transcripts.csv > ${prefix}.fusion_transcripts.csv + cat ${prefix}.fusion_transcripts.csv | tr "," "\\t" > ${prefix}.fusion_transcripts.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agfusion: \$(agfusion -v) (fork) + END_VERSIONS + """ +} diff --git a/modules/local/agfusion/batch/meta.yml b/modules/local/agfusion/batch/meta.yml new file mode 100644 index 0000000..7961454 --- /dev/null +++ b/modules/local/agfusion/batch/meta.yml @@ -0,0 +1,49 @@ +name: agfusion_batch +description: AGFusion batch annotation +keywords: + - agfusion + - fusion + - batch + - frame +tools: + - batch: + description: Annotate a file containing fusions + homepage: https://github.com/anoronh4/AGFusion + documentation: https://github.com/anoronh4/AGFusion/blob/master/README.md + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fusions: + type: path + description: Fusions in one of many formats (may include arriba outputs or cff) + pattern: "*" + - agfusion_db: + type: path + description: File containing AGFusion reference information. + pattern: "*.db" + - pyensembl_cache: + type: path + description: Folder containing pyensembl cache. + pattern: "*" + +output: + - versions: + type: path + description: File containing software versions + pattern: "versions.yml" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - agfusion_result: + type: path + description: Folder containing annotations + pattern: "${prefix}/" + +authors: + - "@anoronh4" diff --git a/modules/local/agfusion/container/Dockerfile b/modules/local/agfusion/container/Dockerfile new file mode 100755 index 0000000..3c20dd9 --- /dev/null +++ b/modules/local/agfusion/container/Dockerfile @@ -0,0 +1,26 @@ +FROM ubuntu:bionic-20230530 + +LABEL maintainer="Anne Marie Noronha (noronhaa@mskcc.org)" \ + version.image="0.0.5" + +# INSTALL DEPENDENCIES + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update -y +RUN apt-get install -y build-essential python3 python3-pip python3-matplotlib python3-pandas python3-future python3-biopython curl less vim libnss-sss git zip +RUN pip3 install --upgrade pip +RUN pip3 install pyensembl + +# Additional libraries needed for AGFusion build command +RUN apt-get install -y default-libmysqlclient-dev +RUN pip3 install mysqlclient + +# INSTALL AGFUSION & DATABASE FILES +WORKDIR /usr/local/bin +RUN git clone https://github.com/anoronh4/AGFusion.git --branch feature/metafusion_parser +WORKDIR /usr/local/bin/AGFusion +RUN pip3 install . 
+ +# pin gtfparse to 1.2.1 for compatibility (original note: needed by pyensembl) +RUN pip3 install gtfparse==1.2.1 --upgrade diff --git a/modules/local/agfusion/download/main.nf b/modules/local/agfusion/download/main.nf new file mode 100644 index 0000000..5744641 --- /dev/null +++ b/modules/local/agfusion/download/main.nf @@ -0,0 +1,58 @@ +process AGFUSION_DOWNLOAD { + label 'process_low' + + // NOTE(review): the "2.7X indices incompatible with AWS iGenomes" note looks copied from another module and likely does not apply to agfusion - confirm. + conda 'bioconda::agfusion=1.252' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'cmopipeline/agfusion:0.0.5' : + 'cmopipeline/agfusion:0.0.5' }" + + input: + val(ensembl_release) + val(genome) + + output: + path "agfusion.*.db" , emit: agfusion_db + path "pyensembl_cache", emit: pyensembl_cache + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def agfusion_genome = ['GRCh37','smallGRCh37','hg19'].contains(genome) ? 'hg19' : + ['GRCh38','hg38'].contains(genome) ? 'hg38' : + ['GRCm38','mm10'].contains(genome) ? 'mm10' : '' + def pyensembl_species = ['GRCm38','mm10'].contains(genome) ? 'mus_musculus' : 'homo_sapiens' + if (ensembl_release < 93) { + """ + export PYENSEMBL_CACHE_DIR=\$PWD/pyensembl_cache + + pyensembl install --species ${pyensembl_species} --release ${ensembl_release} + + agfusion download -g ${agfusion_genome} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agfusion: \$(agfusion -v) (fork) + END_VERSIONS + """ + } else { + """ + export PYENSEMBL_CACHE_DIR=\$PWD/pyensembl_cache + + pyensembl install --species ${pyensembl_species} --release ${ensembl_release} + + curl http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pfamA.txt.gz > pfamA.txt.gz + gunzip pfamA.txt.gz + agfusion build --dir . 
--species ${agfusion_genome} --release ${ensembl_release} --pfam pfamA.txt + rm pfamA.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agfusion: \$(agfusion -v) (fork) + END_VERSIONS + """ + } +} diff --git a/modules/local/agfusion/download/meta.yml b/modules/local/agfusion/download/meta.yml new file mode 100644 index 0000000..21a15a8 --- /dev/null +++ b/modules/local/agfusion/download/meta.yml @@ -0,0 +1,38 @@ +name: agfusion_download +description: Download or build the AGFusion database and install the pyensembl cache +keywords: + - agfusion +tools: + - agfusion: + description: Download or build the AGFusion database and install the pyensembl cache + homepage: https://github.com/anoronh4/AGFusion + documentation: https://github.com/anoronh4/AGFusion/blob/master/README.md + tool_dev_url: https://github.com/anoronh4/AGFusion + # NOTE(review): doi "10.1101/011650" and licence ["GPL v3"] were copied from fusioncatcher; replace with the AGFusion values + +input: + - ensembl_release: + type: value + description: Ensembl release number passed to pyensembl install; releases below 93 use "agfusion download", others use "agfusion build" + pattern: "*" + - genome: + type: value + description: Genome name (e.g. GRCh37, GRCh38, GRCm38) used to select the AGFusion genome and the pyensembl species + pattern: "*" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - agfusion_db: + type: file + description: AGFusion database file + pattern: "agfusion.*.db" + - pyensembl_cache: + type: directory + description: Directory containing the pyensembl cache + pattern: "pyensembl_cache" + +authors: + - "@praveenraj2018" diff --git a/modules/local/cff_annotate/main.nf b/modules/local/cff_annotate/main.nf new file mode 100644 index 0000000..dc4a16b --- /dev/null +++ b/modules/local/cff_annotate/main.nf @@ -0,0 +1,35 @@ +process CFF_ANNOTATE { + tag "$meta.id" + label "process_single" + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'ghcr.io/rocker-org/devcontainer/tidyverse:4' : + 'ghcr.io/rocker-org/devcontainer/tidyverse:4' }" + + input: + tuple val(meta), path(cff), path(oncokb), path(agfusion) + + output: + tuple val(meta), path("${prefix}.unfiltered.cff"), emit: unfiltered_cff + tuple val(meta), path("${prefix}.final.cff") , emit: filtered_cff + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${cff}" + """ + add_annotations_cff.R \\ + --cff ${cff} \\ + --oncokb ${oncokb} \\ + --agfusion ${agfusion} \\ + --out-prefix ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + add_annotations_cff.R: 0.0.1 + END_VERSIONS + """ +} diff --git a/modules/local/convert_to_cff/main.nf b/modules/local/convert_to_cff/main.nf new file mode 100644 index 0000000..ec5ecf0 --- /dev/null +++ b/modules/local/convert_to_cff/main.nf @@ -0,0 +1,31 @@ +process TO_CFF { + tag "$meta.id" + label "process_single" + +// must be using singularity 3.7+ + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'ghcr.io/rocker-org/devcontainer/tidyverse:4' : + 'ghcr.io/rocker-org/devcontainer/tidyverse:4' }" + + input: + tuple val(meta), val(caller), path(fusions) + + output: + tuple val(meta), path("*.cff"), emit: cff + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def sample = "${meta.sample}" + """ + make_cff_from_forte.R $caller $fusions $sample ${sample}_${caller}.cff + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + make_cff_from_forte.R: 0.0.1 + END_VERSIONS + """ +} diff --git a/modules/local/metafusion/container/Dockerfile b/modules/local/metafusion/container/Dockerfile new file mode 100755 index 0000000..018cef8 --- /dev/null +++ b/modules/local/metafusion/container/Dockerfile @@ -0,0 +1,23 @@ +FROM mapostolides/metafusion + +LABEL author="Alexandria Dymun (pintoa1@mskcc.org)" \ + maintainer="Anne Marie Noronha (noronhaa@mskcc.org)" \ + version.image="0.0.6" + +ENV METAFUSION_TAG="v1.0.1" +ENV PATH="${PATH}:/MetaFusion/scripts" + +RUN R -e "chooseCRANmirror(ind=52); install.packages(c('plyr','data.table'))" + +RUN git clone -b $METAFUSION_TAG https://github.com/pintoa1-mskcc/MetaFusion.git --single-branch && \ + cd /MetaFusion/ && \ + git pull && \ + chmod -R +x /MetaFusion/scripts && \ + rm -rf /MetaFusion/test_data + +# run cleanup +RUN conda clean -afy \ + && find /opt/conda/ -follow -type f -name '*.a' -delete \ + && find /opt/conda/ -follow -type f -name '*.pyc' -delete \ + && find /opt/conda/ -follow -type f -name '*.js.map' -delete + diff --git a/modules/local/metafusion/main.nf b/modules/local/metafusion/main.nf new file mode 100644 index 0000000..e3e8120 --- /dev/null +++ b/modules/local/metafusion/main.nf @@ -0,0 +1,47 @@ +process METAFUSION { + tag "$meta.id" + label "process_low" + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'cmopipeline/metafusion:0.0.6' : + 'cmopipeline/metafusion:0.0.6' }" + + input: + tuple val(meta), path(cff) + path genebed + path info + path fasta + path blocklist + + output: + tuple val(meta), path("*final*cluster") , emit: cluster + tuple val(meta), path("*.exons") , emit: cff + tuple val(meta), path("cis-sage.cluster") , emit: cis + tuple val(meta), path("problematic_chromosomes.cff"), emit: problem_chrom + tuple val(meta), path("filters.txt") , emit: filters + // tuple val(meta), path("*") , emit: all + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + args = task.ext.args ?: "" + def sample = "${meta.sample}" + """ + Metafusion_forte.sh \\ + --cff $cff \\ + --outdir . \\ + --gene_bed $genebed \\ + --gene_info $info \\ + --genome_fasta $fasta \\ + --recurrent_bedpe $blocklist \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + Metafusion docker: \$METAFUSION_TAG + Metafusion_forte.sh: 0.0.1 + END_VERSIONS + """ +} diff --git a/modules/local/oncokb/fusionannotator/main.nf b/modules/local/oncokb/fusionannotator/main.nf index 16720e0..1a263c5 100644 --- a/modules/local/oncokb/fusionannotator/main.nf +++ b/modules/local/oncokb/fusionannotator/main.nf @@ -3,16 +3,16 @@ process ONCOKB_FUSIONANNOTATOR { label 'process_low' // Note: 2.7X indices incompatible with AWS iGenomes. - conda "shahcompbio::oncokb-annotator=2.3.3" + //conda "shahcompbio::oncokb-annotator=2.3.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'cmopipeline/oncokb-annotator:0.0.1' : 'cmopipeline/oncokb-annotator:0.0.1' }" input: - tuple val(meta), path(fusions) + tuple val(meta), path(cff) output: - tuple val(meta), path("*.oncokb.tsv"), emit: oncokb_fusions + tuple val(meta), path("*.oncokb.tsv") , emit: oncokb_fusions path "versions.yml" , emit: versions when: @@ -23,9 +23,10 @@ process ONCOKB_FUSIONANNOTATOR { def args2 = task.ext.args2 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - awk 'BEGIN {FS=OFS="|"}{gsub("--","-",\$1)}1' ${fusions} > ${fusions}.reformat + awk 'BEGIN { FS = "\\t"; OFS = "\\t"} {print \$31,\$19"-"\$21}' ${cff} | tail -n+2 > ${cff}.reformat + echo -e "Tumor_Sample_Barcode\tFusion" | cat - ${cff}.reformat > ${cff}.1reformat FusionAnnotator.py \\ - -i ${fusions}.reformat \\ + -i ${cff}.1reformat \\ -o ${prefix}.oncokb.tsv \\ ${args} diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf new file mode 100644 index 0000000..9f06221 --- /dev/null +++ b/modules/nf-core/cat/cat/main.nf @@ -0,0 +1,62 @@ +process CAT_CAT { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::pigz=2.3.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(files_in) + + output: + tuple val(meta), path("${prefix}"), emit: file_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | + // | ungzipped | ungzipped | cat | | + // | gzipped | ungzipped | zcat | | + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' + command2 = (!in_zip && out_zip) ? "| pigz -c -p $task.cpus $args2" : '' + """ + $command1 \\ + $args \\ + ${file_list.join(' ')} \\ + $command2 \\ + > ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + """ + touch $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml new file mode 100644 index 0000000..8acc0bf --- /dev/null +++ b/modules/nf-core/cat/cat/meta.yml @@ -0,0 +1,37 @@ +name: cat_cat +description: A module for concatenation of gzipped or uncompressed files +keywords: + - concatenate + - gzip + - cat +tools: + - cat: + description: Just concatenation + + documentation: 
https://man7.org/linux/man-pages/man1/cat.1.html + + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - files_in: + type: file + description: List of compressed / uncompressed files + pattern: "*" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - file_out: + type: file + description: Concatenated file. Will be gzipped if file_out ends with ".gz" + pattern: "${file_out}" + +authors: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/csvtk/concat/main.nf b/modules/nf-core/csvtk/concat/main.nf deleted file mode 100644 index c9fb9bf..0000000 --- a/modules/nf-core/csvtk/concat/main.nf +++ /dev/null @@ -1,43 +0,0 @@ -process CSVTK_CONCAT { - tag "$meta.id" - label 'process_low' - - conda "bioconda::csvtk=0.23.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/csvtk:0.23.0--h9ee0642_0' : - 'quay.io/biocontainers/csvtk:0.23.0--h9ee0642_0' }" - - input: - tuple val(meta), path(csv) - val in_format - val out_format - - output: - tuple val(meta), path("${prefix}.${out_extension}"), emit: csv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" - def delimiter = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format) - def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format) - out_extension = out_format == "tsv" ? 
'tsv' : 'csv' - """ - csvtk \\ - concat \\ - $args \\ - --num-cpus $task.cpus \\ - --delimiter "${delimiter}" \\ - --out-delimiter "${out_delimiter}" \\ - --out-file ${prefix}.${out_extension} \\ - $csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) - END_VERSIONS - """ -} diff --git a/modules/nf-core/csvtk/concat/meta.yml b/modules/nf-core/csvtk/concat/meta.yml deleted file mode 100644 index 2d2f856..0000000 --- a/modules/nf-core/csvtk/concat/meta.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: csvtk_concat -description: Concatenate two or more CSV (or TSV) tables into a single table -keywords: - - concatenate - - tsv - - csv -tools: - - csvtk: - description: A cross-platform, efficient, practical CSV/TSV toolkit - homepage: http://bioinf.shenwei.me/csvtk - documentation: http://bioinf.shenwei.me/csvtk - tool_dev_url: https://github.com/shenwei356/csvtk - doi: "" - licence: ["MIT"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - csv: - type: file - description: CSV/TSV formatted files - pattern: "*.{csv,tsv}" - - in_format: - type: string - description: Input format (csv, tab, or a delimiting character) - pattern: "*" - - out_format: - type: string - description: Output format (csv, tab, or a delimiting character) - pattern: "*" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "version.yml" - - csv: - type: file - description: Concatenated CSV/TSV file - pattern: "*.{csv,tsv}" - -authors: - - "@rpetit3" diff --git a/nextflow.config b/nextflow.config index 234d21b..5c026f9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -33,6 +33,7 @@ params { cosmic_usr = null fusion_report_cutoff = 1 skip_fusion_report = false + fusion_tool_cutoff = 1 // QC // rseqc_modules can include ['bam_stat','inner_distance','infer_experiment','junction_annotation','junction_saturation','read_distribution','read_duplication','tin'] diff --git a/subworkflows/local/fusion.nf b/subworkflows/local/fusion.nf index 4f045ee..af9c7cc 100644 --- a/subworkflows/local/fusion.nf +++ b/subworkflows/local/fusion.nf @@ -3,9 +3,15 @@ include { ARRIBA } from '../../modules/nf-core/arriba include { STAR_ALIGN as STAR_FOR_STARFUSION } from '../../modules/nf-core/star/align/main' include { STARFUSION } from '../../modules/local/starfusion/detect/main' include { FUSIONCATCHER_DETECT } from '../../modules/local/fusioncatcher/detect/main' -include { FUSIONREPORT } from '../../modules/local/fusionreport/run/main' include { ONCOKB_FUSIONANNOTATOR } from '../../modules/local/oncokb/fusionannotator/main' -include { CSVTK_CONCAT as CSV_TO_TSV } from '../../modules/nf-core/csvtk/concat/main' +include { AGFUSION_BATCH } from '../../modules/local/agfusion/batch/main' +include { TO_CFF as ARRIBA_TO_CFF } from '../../modules/local/convert_to_cff/main' +include { TO_CFF as FUSIONCATCHER_TO_CFF } from '../../modules/local/convert_to_cff/main' +include { TO_CFF as STARFUSION_TO_CFF } from '../../modules/local/convert_to_cff/main' +include { CAT_CAT as MERGE_CFF } from '../../modules/nf-core/cat/cat/main' +include { METAFUSION } from '../../modules/local/metafusion/main' +include { ADD_FLAG } from '../../modules/local/add_flags/main' +include { CFF_ANNOTATE as CFF_FINALIZE } 
from '../../modules/local/cff_annotate/main' workflow FUSION { @@ -16,10 +22,17 @@ workflow FUSION { starfusion_ref fusioncatcher_ref fusion_report_db + agfusion_db + pyensembl_cache + gene_bed + blocklist main: ch_versions = Channel.empty() fasta = params.fasta + //gene_bed = params.metafusion_gene_bed + gene_info = params.metafusion_gene_info + //blocklist = params.metafusion_blocklist STAR_FOR_ARRIBA( reads, @@ -70,42 +83,65 @@ workflow FUSION { fc_fusions = ["GRCh37","hg19","smallGRCh37"].contains(params.genome) ? FUSIONCATCHER_DETECT.out.fusions_alt : FUSIONCATCHER_DETECT.out.fusions + + ARRIBA_TO_CFF(ARRIBA.out.fusions + .map{ meta, file ->[ meta, "arriba", file ] }) + FUSIONCATCHER_TO_CFF(fc_fusions + .map{ meta, file -> [ meta, "fusioncatcher", file ] } ) + STARFUSION_TO_CFF(STARFUSION.out.abridged + .map{ meta, file -> [ meta, "starfusion", file ] }) // get expected number of callers for groupTuple numcallers = 1 + ( params.starfusion_url ? 1 : 0 ) + ( ["GRCh37","GRCh38"].contains(params.genome) ? 
1 : 0 ) - FUSIONREPORT( - ARRIBA.out.fusions - .map{ meta, file ->[ meta, "arriba", file ] } + MERGE_CFF( + ARRIBA_TO_CFF.out.cff + .map{ meta, file -> [meta, file]} .mix( - fc_fusions - .map{ meta, file -> [ meta, "fusioncatcher", file ] } + FUSIONCATCHER_TO_CFF.out.cff + .map{ meta, file -> [meta, file]} ).mix( - STARFUSION.out.abridged - .map{ meta, file -> [ meta, "starfusion", file ] } - ) - .groupTuple(by:[0],size:numcallers) - .map{ meta, caller, file -> - def avg_weight = caller.collect({(100/caller.size()).toInteger()}) - avg_weight[-1] = avg_weight[-1] + (100-avg_weight.sum()) - [ meta, caller, avg_weight, file ] - }, - fusion_report_db + STARFUSION_TO_CFF.out.cff + .map{ meta, file -> [meta, file]} + ).groupTuple(by:[0],size:numcallers), ) - ch_versions = ch_versions.mix(FUSIONREPORT.out.versions.first()) - - CSV_TO_TSV( - FUSIONREPORT.out.fusionreport_csv - .map{ meta, csv -> - [meta, [csv]] - }, - "csv", - "tsv" + + METAFUSION( + MERGE_CFF.out.file_out, + gene_bed, + gene_info, + fasta, + blocklist + ) + + ADD_FLAG( + METAFUSION.out.cluster + .join(METAFUSION.out.cis, by:0) + .join(METAFUSION.out.cff, by:0) + .join(METAFUSION.out.problem_chrom, by:0) + .join(METAFUSION.out.filters, by:0) ) - ch_versions = ch_versions.mix(CSV_TO_TSV.out.versions.first()) - ONCOKB_FUSIONANNOTATOR(CSV_TO_TSV.out.csv) + ONCOKB_FUSIONANNOTATOR(ADD_FLAG.out.unfiltered_cff) ch_versions = ch_versions.mix(ONCOKB_FUSIONANNOTATOR.out.versions.first()) + AGFUSION_BATCH( + ADD_FLAG.out.unfiltered_cff, + agfusion_db, + pyensembl_cache + ) + ch_versions = ch_versions.mix(AGFUSION_BATCH.out.versions.first()) + + CFF_FINALIZE( + ADD_FLAG.out.unfiltered_cff + .join(ONCOKB_FUSIONANNOTATOR.out.oncokb_fusions, by:0) + .join(AGFUSION_BATCH.out.fusion_transcripts_tsv, by:0) + ) + ch_versions = ch_versions.mix(ADD_FLAG.out.versions.first()) + ch_versions = ch_versions.mix(METAFUSION.out.versions.first()) + ch_versions = ch_versions.mix(ARRIBA_TO_CFF.out.versions.first()) + ch_versions = 
ch_versions.mix(FUSIONCATCHER_TO_CFF.out.versions.first()) + ch_versions = ch_versions.mix(STARFUSION_TO_CFF.out.versions.first()) + emit: ch_versions } diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index a45f70b..463daf6 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -5,11 +5,16 @@ include { GATK4_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/gatk4/cre include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx/main' include { GATK4_BEDTOINTERVALLIST } from '../../modules/nf-core/gatk4/bedtointervallist/main' include { PREPARE_RRNA } from '../../modules/local/prepare_rrna/main' -include { GUNZIP } from '../../modules/nf-core/gunzip/main' +include { + GUNZIP as GUNZIP_GTF ; + GUNZIP as GUNZIP_METAFUSIONGENEBED ; + GUNZIP as GUNZIP_METAFUSIONBLOCKLIST +} from '../../modules/nf-core/gunzip/main' include { STARFUSION_DOWNLOAD } from '../../modules/local/starfusion/download/main' include { FUSIONCATCHER_DOWNLOAD } from '../../modules/local/fusioncatcher/download/main' include { FUSIONREPORT_DOWNLOAD } from '../../modules/local/fusionreport/download/main' include { KALLISTO_INDEX } from '../../modules/nf-core/kallisto/index/main' +include { AGFUSION_DOWNLOAD } from '../../modules/local/agfusion/download/main' workflow PREPARE_REFERENCES { @@ -18,12 +23,26 @@ workflow PREPARE_REFERENCES { ch_versions = Channel.empty() if (params.gtf.endsWith(".gz")){ - GUNZIP([[:],params.gtf]) - gtf = GUNZIP.out.gunzip.map{ it[1] }.first() + GUNZIP_GTF([[:],params.gtf]) + gtf = GUNZIP_GTF.out.gunzip.map{ it[1] }.first() } else { gtf = params.gtf } + if (params.metafusion_blocklist.endsWith(".gz")){ + GUNZIP_METAFUSIONBLOCKLIST([[:],params.metafusion_blocklist]) + metafusion_blocklist = GUNZIP_METAFUSIONBLOCKLIST.out.gunzip.map{ it[1] }.first() + } else { + metafusion_blocklist = params.metafusion_blocklist + } + + if (params.metafusion_gene_bed.endsWith(".gz")){ 
+ GUNZIP_METAFUSIONGENEBED([[:],params.metafusion_gene_bed]) + metafusion_gene_bed = GUNZIP_METAFUSIONGENEBED.out.gunzip.map{ it[1] }.first() + } else { + metafusion_gene_bed = params.metafusion_gene_bed + } + STAR_GENOMEGENERATE(params.fasta,gtf) ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) star_index = STAR_GENOMEGENERATE.out.index @@ -72,6 +91,12 @@ workflow PREPARE_REFERENCES { //cosmic_passwd = params.cosmic_passwd ?: "" FUSIONREPORT_DOWNLOAD() + AGFUSION_DOWNLOAD( + params.ensembl_version, + params.genome + ) + ch_versions = ch_versions.mix(AGFUSION_DOWNLOAD.out.versions) + KALLISTO_INDEX(params.cdna) ch_versions = ch_versions.mix(KALLISTO_INDEX.out.versions) @@ -90,6 +115,10 @@ workflow PREPARE_REFERENCES { fusion_report_db = FUSIONREPORT_DOWNLOAD.out.reference rseqc_bed = UCSC_GENEPREDTOBED.out.bed.map{it[1]}.first() kallisto_index = KALLISTO_INDEX.out.idx + agfusion_db = AGFUSION_DOWNLOAD.out.agfusion_db + pyensembl_cache = AGFUSION_DOWNLOAD.out.pyensembl_cache + metafusion_blocklist = metafusion_blocklist + metafusion_gene_bed = metafusion_gene_bed ch_versions = ch_versions } diff --git a/tests/nextflow.config b/tests/nextflow.config index bab5342..fdd8833 100755 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -11,6 +11,10 @@ process { cpus = 2 memory = 3.GB time = 2.h + + withName: '.*_TO_CFF|GUNZIP_METAFUSION.*|AGFUSION.*' { + ext.when = { ! 
(workflow.profile.toString().split(",").contains("singularity") && workflow.profile.toString().split(",").contains("test")) } + } } if ("$PROFILE" == "singularity") { diff --git a/workflows/forte.nf b/workflows/forte.nf index 23e9623..2569e4f 100644 --- a/workflows/forte.nf +++ b/workflows/forte.nf @@ -10,7 +10,7 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) WorkflowForte.initialise(params, log) // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.gtf, params.refflat, params.starfusion_url ] +def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.gtf, params.refflat, params.starfusion_url, params.metafusion_gene_bed, params.metafusion_blocklist, params.metafusion_gene_info ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters @@ -130,7 +130,11 @@ workflow FORTE { PREPARE_REFERENCES.out.gtf, PREPARE_REFERENCES.out.starfusion_ref, PREPARE_REFERENCES.out.fusioncatcher_ref, - PREPARE_REFERENCES.out.fusion_report_db + PREPARE_REFERENCES.out.fusion_report_db, + PREPARE_REFERENCES.out.agfusion_db, + PREPARE_REFERENCES.out.pyensembl_cache, + PREPARE_REFERENCES.out.metafusion_gene_bed, + PREPARE_REFERENCES.out.metafusion_blocklist ) ch_versions = ch_versions.mix(FUSION.out.ch_versions)