Skip to content

Commit

Permalink
Merge pull request #50 from mskcc/feature/make_cff
Browse files Browse the repository at this point in the history
Metafusion/Fusion Annotation
  • Loading branch information
anoronh4 authored Aug 18, 2023
2 parents b53bb98 + 2fdbb70 commit 56fad45
Show file tree
Hide file tree
Showing 31 changed files with 1,508 additions and 148 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
work/*
runs/*
logs/*
*log
.nextflow.log*
.nextflow*
results/*
159 changes: 159 additions & 0 deletions bin/Metafusion_forte.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#!/bin/bash
#STEPS

# __author__ = "Alexandria Dymun"
# __email__ = "[email protected]"
# __contributor__ = "Anne Marie Noronha ([email protected])"
# __version__ = "0.0.1"
# __status__ = "Dev"


output_ANC_RT_SG=1
RT_call_filter=1
blck_filter=1
ANC_filter=1
usage() {
echo "Usage: Metafusion_forte.sh --num_tools=<minNumToolsCalled> --genome_fasta <FASTA adds SEQ to fusion> --recurrent_bedpe <blacklistFusions> --outdir <outputDirectory> --cff <cffFile> --gene_bed <geneBedFile> --gene_info <geneInfoFile>" 1>&2;
exit 1;
}

# Loop through arguments and process them
while test $# -gt 0;do
case $1 in
-n=*|--num_tools=*)
num_tools="${1#*=}"
shift
;;
--outdir)
outdir="$2"
shift 2
;;
--cff)
cff="$2"
shift 2
;;
--gene_bed)
gene_bed="$2"
shift 2
;;
--gene_info)
gene_info="$2"
shift 2
;;
--genome_fasta)
genome_fasta="$2"
shift 2
;;
--recurrent_bedpe)
recurrent_bedpe="$2"
shift 2
;;
*)
#OTHER_ARGUMENTS+=("$1")
shift # Remove generic argument from processing
;;
esac
done

if [[ ! $cff || ! $gene_info || ! $gene_bed ]]; then
echo "Missing required argument"
usage
fi


mkdir $outdir

#Check CFF file format:
#Remove entries with nonconformming chromosome name

all_gene_bed_chrs=`awk -F '\t' '{print $1}' $gene_bed | sort | uniq | sed 's/chr//g '`
awk -F " " -v arr="${all_gene_bed_chrs[*]}" 'BEGIN{OFS = "\t"; split(arr,arr1); for(i in arr1) dict[arr1[i]]=""} $1 in dict && $4 in dict' $cff > $outdir/$(basename $cff).cleaned_chr
grep -v -f $outdir/$(basename $cff).cleaned_chr $cff > problematic_chromosomes.cff
cff=$outdir/$(basename $cff).cleaned_chr

#Rename cff
echo Rename cff
rename_cff_file_genes.MetaFusion.py $cff $gene_info > $outdir/$(basename $cff).renamed
cff=$outdir/$(basename $cff).renamed

#Annotate cff
if [ $genome_fasta ]; then
echo Annotate cff, extract sequence surrounding breakpoint
reann_cff_fusion.py --cff $cff --gene_bed $gene_bed --ref_fa $genome_fasta > $outdir/$(basename $cff).reann.WITH_SEQ
else
echo Annotate cff, no extraction of sequence surrounding breakpoint
reann_cff_fusion.py --cff $cff --gene_bed $gene_bed > $outdir/$(basename $cff).reann.NO_SEQ
fi

# Assign .cff based on SEQ or NOSEQ
if [ $genome_fasta ]; then
cff=$outdir/$(basename $cff).reann.WITH_SEQ
echo cff $cff
else
cff=$outdir/$(basename $cff).reann.NO_SEQ
echo cff $cff
fi

echo Add adjacent exons to cff
extract_closest_exons.py $cff $gene_bed $genome_fasta > $outdir/$(basename $cff).exons

# assign cff as ".exons" if --annotate_exons flag was specified

cff=$outdir/$(basename $cff).exons


#Merge
cluster=$outdir/$(basename $cff).cluster
echo Merge cff by genes and breakpoints
RUN_cluster_genes_breakpoints.sh $cff $outdir > $cluster

#output ANC_RT_SG file
if [ $output_ANC_RT_SG -eq 1 ]; then
echo output cis-sage.cluster file
output_ANC_RT_SG.py $cluster > $outdir/cis-sage.cluster
fi

#ReadThrough Callerfilter
if [ $RT_call_filter -eq 1 ]; then
echo ReadThrough, callerfilter $num_tools
cat $cluster | grep ReadThrough > $outdir/$(basename $cluster).ReadThrough
callerfilter_num.py --cluster $cluster --num_tools $num_tools > $outdir/$(basename $cluster).callerfilter.$num_tools
callerfilter_excluded=$(comm -13 <(cut -f 22 $outdir/$(basename $cluster).callerfilter.$num_tools | sort | uniq) <(cut -f 22 $cluster | sort | uniq))
grep -v ReadThrough $outdir/$(basename $cluster).callerfilter.$num_tools > $outdir/$(basename $cluster).RT_filter.callerfilter.$num_tools
cluster_RT_call=$outdir/$(basename $cluster).RT_filter.callerfilter.$num_tools
fi
# Blocklist Filter
if [ $recurrent_bedpe ]; then
echo blocklist filter
blocklist_filter_recurrent_breakpoints.sh $cff $cluster_RT_call $outdir $recurrent_bedpe > $outdir/$(basename $cluster).RT_filter.callerfilter.$num_tools.blck_filter

blocklist_cluster=$outdir/$(basename $cluster_RT_call).BLOCKLIST

cluster=$outdir/$(basename $cluster).RT_filter.callerfilter.$num_tools.blck_filter
fi
# Adjacent Noncoding filter
if [ $ANC_filter -eq 1 ]; then
echo ANC adjacent noncoding filter
filter_adjacent_noncoding.py $cluster > $outdir/$(basename $cluster).ANC_filter

cluster=$outdir/$(basename $cluster).ANC_filter
fi
#Rank and generate final.cluster
echo Rank and generate final.cluster
rank_cluster_file.py $cluster > $outdir/final.n$num_tools.cluster
cluster=$outdir/final.n$num_tools.cluster
### Generate filtered FID file
#out=`awk -F '\t' '{print $15}' $cluster | tail -n +2`
#out2=`awk -F '\t' '{print $22}' $outdir/cis-sage.cluster | tail -n +2`
#out3=`echo $out $out2`
#echo ${out3//,/ } > out4
#out5=`tr ' ' '\n' < out4 | sort | uniq`

#for this in $(echo $out5); do grep $this $cff; done >> $outdir/$(basename $cff).filtered.cff

rm -f filters.txt
cut -f 22 *.BLOCKLIST | tr "," "\n" | sort | uniq | sed "s/$/\tblocklist/g" > filters.txt
cut -f 22 *.ANC_filter | tr "," "\n" | sort | uniq | sed "s/$/\tadjacent_noncoding/g" >> filters.txt
cut -f 22 *.ReadThrough | tr "," "\n" | sort | uniq | sed "s/$/\tread_through/g" >> filters.txt
echo -en "$callerfilter_excluded" | tr "," "\n" | sort | uniq | sed "s/$/\tcaller_filter/g" >> filters.txt

109 changes: 109 additions & 0 deletions bin/add_annotations_cff.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/local/bin/Rscript
# __author__ = "Anne Marie Noronha"
# __email__ = "[email protected]"
# __version__ = "0.0.1"


suppressPackageStartupMessages({
library(dplyr)
library(data.table)
})

usage <- function() {
message("Usage:")
message("add_annotations_cff.R --cff-file <file.cff> --agfusion-file <agfusion.tsv> --oncokb-file <oncokb.tsv> --out-prefix <prefix>")
}

args = commandArgs(TRUE)

if (is.null(args) | length(args)<1) {
usage()
quit()
}

#' Parse out options from a string without recourse to optparse
#'
#' @param x Long-form argument list like --opt1 val1 --opt2 val2
#'
#' @return named list of options and values similar to optparse

parse_args <- function(x){
args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1]
args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE))
# Ensure the option vectors are length 2 (key/ value) to catch empty ones
args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z})

parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) gsub('-','_',x[1])))
parsed_args[! is.na(parsed_args)]
}

args_opt <- parse_args(paste(args,collapse=" "))

possible_args = c("cff", "oncokb", "agfusion", "out_prefix")
if (length(setdiff(names(args_opt),possible_args)) > 0){
message("Invalid options")
usage()
quit()
}

if (length(setdiff(possible_args,names(args_opt))) > 0) {
message("Missing required arguments")
usage()
quit()
}

oncokb_file = args_opt$oncokb
agfusion_file = args_opt$agfusion
cff_file = args_opt$cff
out_prefix = args_opt$out_prefix

cff = fread(cff_file)
final_cff_cols <- c(names(cff))
agfusion_tab = fread(agfusion_file) %>% select(c(`5'_transcript`,`3'_transcript`,`5'_breakpoint`,`3'_breakpoint`,Fusion_effect))
final_cff_cols <- c(final_cff_cols,"Fusion_effect")
if (!is.null(oncokb_file)){
oncokb_tab = fread(oncokb_file) %>% select(-Fusion)
final_cff_cols = c(final_cff_cols,names(oncokb_tab %>% select(-Tumor_Sample_Barcode)))
cff <- merge(
cff,
oncokb_tab,
by.x ="FID",
by.y = "Tumor_Sample_Barcode",
all.x = T,
all.y=F
)
}

cff <- merge(
cff,
agfusion_tab,
by.x = c("gene5_transcript_id","gene3_transcript_id","gene5_breakpoint","gene3_breakpoint"),
by.y = c("5'_transcript","3'_transcript","5'_breakpoint","3'_breakpoint"),
all.x = T,
all.y = T
)

cff <- as.data.frame(cff)[,c(final_cff_cols)]
#cff <- cff %>% mutate(!!final_cff_cols[34] := Fusion_effect) %>% select(-c(Fusion_effect))

write.table(
cff,
paste0(out_prefix, ".unfiltered.cff"),
row.names = F,
quote = F,
sep = "\t",
col.names = ! "V1" %in% final_cff_cols
)

filtered_cff <- cff %>% filter(! (is.na(cluster) | is.null(cluster) | cluster == ""))
write.table(
filtered_cff,
paste0(out_prefix, ".final.cff"),
row.names = F,
append = F,
quote = F,
sep = "\t",
col.names = ! "V1" %in% final_cff_cols
)


Loading

0 comments on commit 56fad45

Please sign in to comment.