-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #50 from mskcc/feature/make_cff
Metafusion/Fusion Annotation
- Loading branch information
Showing
31 changed files
with
1,508 additions
and
148 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
work/* | ||
runs/* | ||
logs/* | ||
*log | ||
.nextflow.log* | ||
.nextflow* | ||
results/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
#!/bin/bash | ||
#STEPS | ||
|
||
# __author__ = "Alexandria Dymun" | ||
# __email__ = "[email protected]" | ||
# __contributor__ = "Anne Marie Noronha ([email protected])" | ||
# __version__ = "0.0.1" | ||
# __status__ = "Dev" | ||
|
||
|
||
output_ANC_RT_SG=1 | ||
RT_call_filter=1 | ||
blck_filter=1 | ||
ANC_filter=1 | ||
usage() { | ||
echo "Usage: Metafusion_forte.sh --num_tools=<minNumToolsCalled> --genome_fasta <FASTA adds SEQ to fusion> --recurrent_bedpe <blacklistFusions> --outdir <outputDirectory> --cff <cffFile> --gene_bed <geneBedFile> --gene_info <geneInfoFile>" 1>&2; | ||
exit 1; | ||
} | ||
|
||
# Loop through arguments and process them | ||
while test $# -gt 0;do | ||
case $1 in | ||
-n=*|--num_tools=*) | ||
num_tools="${1#*=}" | ||
shift | ||
;; | ||
--outdir) | ||
outdir="$2" | ||
shift 2 | ||
;; | ||
--cff) | ||
cff="$2" | ||
shift 2 | ||
;; | ||
--gene_bed) | ||
gene_bed="$2" | ||
shift 2 | ||
;; | ||
--gene_info) | ||
gene_info="$2" | ||
shift 2 | ||
;; | ||
--genome_fasta) | ||
genome_fasta="$2" | ||
shift 2 | ||
;; | ||
--recurrent_bedpe) | ||
recurrent_bedpe="$2" | ||
shift 2 | ||
;; | ||
*) | ||
#OTHER_ARGUMENTS+=("$1") | ||
shift # Remove generic argument from processing | ||
;; | ||
esac | ||
done | ||
|
||
if [[ ! $cff || ! $gene_info || ! $gene_bed ]]; then | ||
echo "Missing required argument" | ||
usage | ||
fi | ||
|
||
|
||
mkdir $outdir | ||
|
||
#Check CFF file format: | ||
#Remove entries with nonconformming chromosome name | ||
|
||
all_gene_bed_chrs=`awk -F '\t' '{print $1}' $gene_bed | sort | uniq | sed 's/chr//g '` | ||
awk -F " " -v arr="${all_gene_bed_chrs[*]}" 'BEGIN{OFS = "\t"; split(arr,arr1); for(i in arr1) dict[arr1[i]]=""} $1 in dict && $4 in dict' $cff > $outdir/$(basename $cff).cleaned_chr | ||
grep -v -f $outdir/$(basename $cff).cleaned_chr $cff > problematic_chromosomes.cff | ||
cff=$outdir/$(basename $cff).cleaned_chr | ||
|
||
#Rename cff | ||
echo Rename cff | ||
rename_cff_file_genes.MetaFusion.py $cff $gene_info > $outdir/$(basename $cff).renamed | ||
cff=$outdir/$(basename $cff).renamed | ||
|
||
#Annotate cff | ||
if [ $genome_fasta ]; then | ||
echo Annotate cff, extract sequence surrounding breakpoint | ||
reann_cff_fusion.py --cff $cff --gene_bed $gene_bed --ref_fa $genome_fasta > $outdir/$(basename $cff).reann.WITH_SEQ | ||
else | ||
echo Annotate cff, no extraction of sequence surrounding breakpoint | ||
reann_cff_fusion.py --cff $cff --gene_bed $gene_bed > $outdir/$(basename $cff).reann.NO_SEQ | ||
fi | ||
|
||
# Assign .cff based on SEQ or NOSEQ | ||
if [ $genome_fasta ]; then | ||
cff=$outdir/$(basename $cff).reann.WITH_SEQ | ||
echo cff $cff | ||
else | ||
cff=$outdir/$(basename $cff).reann.NO_SEQ | ||
echo cff $cff | ||
fi | ||
|
||
echo Add adjacent exons to cff | ||
extract_closest_exons.py $cff $gene_bed $genome_fasta > $outdir/$(basename $cff).exons | ||
|
||
# assign cff as ".exons" if --annotate_exons flag was specified | ||
|
||
cff=$outdir/$(basename $cff).exons | ||
|
||
|
||
#Merge | ||
cluster=$outdir/$(basename $cff).cluster | ||
echo Merge cff by genes and breakpoints | ||
RUN_cluster_genes_breakpoints.sh $cff $outdir > $cluster | ||
|
||
#output ANC_RT_SG file | ||
if [ $output_ANC_RT_SG -eq 1 ]; then | ||
echo output cis-sage.cluster file | ||
output_ANC_RT_SG.py $cluster > $outdir/cis-sage.cluster | ||
fi | ||
|
||
#ReadThrough Callerfilter | ||
if [ $RT_call_filter -eq 1 ]; then | ||
echo ReadThrough, callerfilter $num_tools | ||
cat $cluster | grep ReadThrough > $outdir/$(basename $cluster).ReadThrough | ||
callerfilter_num.py --cluster $cluster --num_tools $num_tools > $outdir/$(basename $cluster).callerfilter.$num_tools | ||
callerfilter_excluded=$(comm -13 <(cut -f 22 $outdir/$(basename $cluster).callerfilter.$num_tools | sort | uniq) <(cut -f 22 $cluster | sort | uniq)) | ||
grep -v ReadThrough $outdir/$(basename $cluster).callerfilter.$num_tools > $outdir/$(basename $cluster).RT_filter.callerfilter.$num_tools | ||
cluster_RT_call=$outdir/$(basename $cluster).RT_filter.callerfilter.$num_tools | ||
fi | ||
# Blocklist Filter | ||
if [ $recurrent_bedpe ]; then | ||
echo blocklist filter | ||
blocklist_filter_recurrent_breakpoints.sh $cff $cluster_RT_call $outdir $recurrent_bedpe > $outdir/$(basename $cluster).RT_filter.callerfilter.$num_tools.blck_filter | ||
|
||
blocklist_cluster=$outdir/$(basename $cluster_RT_call).BLOCKLIST | ||
|
||
cluster=$outdir/$(basename $cluster).RT_filter.callerfilter.$num_tools.blck_filter | ||
fi | ||
# Adjacent Noncoding filter | ||
if [ $ANC_filter -eq 1 ]; then | ||
echo ANC adjacent noncoding filter | ||
filter_adjacent_noncoding.py $cluster > $outdir/$(basename $cluster).ANC_filter | ||
|
||
cluster=$outdir/$(basename $cluster).ANC_filter | ||
fi | ||
#Rank and generate final.cluster | ||
echo Rank and generate final.cluster | ||
rank_cluster_file.py $cluster > $outdir/final.n$num_tools.cluster | ||
cluster=$outdir/final.n$num_tools.cluster | ||
### Generate filtered FID file | ||
#out=`awk -F '\t' '{print $15}' $cluster | tail -n +2` | ||
#out2=`awk -F '\t' '{print $22}' $outdir/cis-sage.cluster | tail -n +2` | ||
#out3=`echo $out $out2` | ||
#echo ${out3//,/ } > out4 | ||
#out5=`tr ' ' '\n' < out4 | sort | uniq` | ||
|
||
#for this in $(echo $out5); do grep $this $cff; done >> $outdir/$(basename $cff).filtered.cff | ||
|
||
rm -f filters.txt | ||
cut -f 22 *.BLOCKLIST | tr "," "\n" | sort | uniq | sed "s/$/\tblocklist/g" > filters.txt | ||
cut -f 22 *.ANC_filter | tr "," "\n" | sort | uniq | sed "s/$/\tadjacent_noncoding/g" >> filters.txt | ||
cut -f 22 *.ReadThrough | tr "," "\n" | sort | uniq | sed "s/$/\tread_through/g" >> filters.txt | ||
echo -en "$callerfilter_excluded" | tr "," "\n" | sort | uniq | sed "s/$/\tcaller_filter/g" >> filters.txt | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
#!/usr/local/bin/Rscript | ||
# __author__ = "Anne Marie Noronha" | ||
# __email__ = "[email protected]" | ||
# __version__ = "0.0.1" | ||
|
||
|
||
suppressPackageStartupMessages({ | ||
library(dplyr) | ||
library(data.table) | ||
}) | ||
|
||
usage <- function() { | ||
message("Usage:") | ||
message("add_annotations_cff.R --cff-file <file.cff> --agfusion-file <agfusion.tsv> --oncokb-file <oncokb.tsv> --out-prefix <prefix>") | ||
} | ||
|
||
args = commandArgs(TRUE) | ||
|
||
if (is.null(args) | length(args)<1) { | ||
usage() | ||
quit() | ||
} | ||
|
||
#' Parse out options from a string without recourse to optparse | ||
#' | ||
#' @param x Long-form argument list like --opt1 val1 --opt2 val2 | ||
#' | ||
#' @return named list of options and values similar to optparse | ||
|
||
parse_args <- function(x){ | ||
args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] | ||
args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) | ||
# Ensure the option vectors are length 2 (key/ value) to catch empty ones | ||
args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) | ||
|
||
parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) gsub('-','_',x[1]))) | ||
parsed_args[! is.na(parsed_args)] | ||
} | ||
|
||
args_opt <- parse_args(paste(args,collapse=" ")) | ||
|
||
possible_args = c("cff", "oncokb", "agfusion", "out_prefix") | ||
if (length(setdiff(names(args_opt),possible_args)) > 0){ | ||
message("Invalid options") | ||
usage() | ||
quit() | ||
} | ||
|
||
if (length(setdiff(possible_args,names(args_opt))) > 0) { | ||
message("Missing required arguments") | ||
usage() | ||
quit() | ||
} | ||
|
||
oncokb_file = args_opt$oncokb | ||
agfusion_file = args_opt$agfusion | ||
cff_file = args_opt$cff | ||
out_prefix = args_opt$out_prefix | ||
|
||
cff = fread(cff_file) | ||
final_cff_cols <- c(names(cff)) | ||
agfusion_tab = fread(agfusion_file) %>% select(c(`5'_transcript`,`3'_transcript`,`5'_breakpoint`,`3'_breakpoint`,Fusion_effect)) | ||
final_cff_cols <- c(final_cff_cols,"Fusion_effect") | ||
if (!is.null(oncokb_file)){ | ||
oncokb_tab = fread(oncokb_file) %>% select(-Fusion) | ||
final_cff_cols = c(final_cff_cols,names(oncokb_tab %>% select(-Tumor_Sample_Barcode))) | ||
cff <- merge( | ||
cff, | ||
oncokb_tab, | ||
by.x ="FID", | ||
by.y = "Tumor_Sample_Barcode", | ||
all.x = T, | ||
all.y=F | ||
) | ||
} | ||
|
||
cff <- merge( | ||
cff, | ||
agfusion_tab, | ||
by.x = c("gene5_transcript_id","gene3_transcript_id","gene5_breakpoint","gene3_breakpoint"), | ||
by.y = c("5'_transcript","3'_transcript","5'_breakpoint","3'_breakpoint"), | ||
all.x = T, | ||
all.y = T | ||
) | ||
|
||
cff <- as.data.frame(cff)[,c(final_cff_cols)] | ||
#cff <- cff %>% mutate(!!final_cff_cols[34] := Fusion_effect) %>% select(-c(Fusion_effect)) | ||
|
||
write.table( | ||
cff, | ||
paste0(out_prefix, ".unfiltered.cff"), | ||
row.names = F, | ||
quote = F, | ||
sep = "\t", | ||
col.names = ! "V1" %in% final_cff_cols | ||
) | ||
|
||
filtered_cff <- cff %>% filter(! (is.na(cluster) | is.null(cluster) | cluster == "")) | ||
write.table( | ||
filtered_cff, | ||
paste0(out_prefix, ".final.cff"), | ||
row.names = F, | ||
append = F, | ||
quote = F, | ||
sep = "\t", | ||
col.names = ! "V1" %in% final_cff_cols | ||
) | ||
|
||
|
Oops, something went wrong.