diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f94a9ae --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +Runing_hotspots.R diff --git a/README.md b/README.md index 3c7add8..9ef9823 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,9 @@ Install dependent packages (**data.table**, **IRanges**, **BSgenome.Hsapiens.UCS ``` install.packages("data.table") -source("http://bioconductor.org/biocLite.R") -biocLite("IRanges","BSgenome.Hsapiens.UCSC.hg19") +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") +BiocManager::install(c("IRanges","BSgenome.Hsapiens.UCSC.hg19")) ``` ####Usage: diff --git a/funcs.R b/funcs.R index 67220e0..58cee10 100644 --- a/funcs.R +++ b/funcs.R @@ -588,27 +588,45 @@ prepmaf=function(maf,expressiontb) { cat('Prepping MAF for analysis ...\n') #only non-indel, coding mutations - coding=c('initiator_codon_variant','missense_variant','splice_acceptor_variant', - 'splice_donor_variant','stop_gained','stop_lost','stop_retained_variant','synonymous_variant') - cat(' ... Reducing MAF to protein-coding substitutions\n') - maf=maf[ which(maf$Consequence %in% coding & maf$CANONICAL=='YES' & maf$BIOTYPE=='protein_coding'), ] - #remove indels - maf=maf[ which(!maf$Variant_Type%in%c('INS','DEL')), ] + + coding=c('initiator_codon_variant','missense_variant','splice_acceptor_variant', 'splice_donor_variant','stop_gained','stop_lost','stop_retained_variant','synonymous_variant','inframe_insertion',"inframe_deletion","frameshift_truncation") + + cat(' ... Reducing MAF to protein-coding substitutions\n') + maf <- maf[ which(maf$Consequence %in% coding & maf$CANONICAL=='YES' & maf$BIOTYPE=='protein_coding'), ] + + # remove indels + # maf=maf[ which(!maf$Variant_Type%in%c('INS','DEL')), ] # add additional annotations if(!'TUMORTYPE' %in% colnames(maf)) maf$TUMORTYPE='none' if(!'Master_ID' %in% colnames(maf)) maf$Master_ID=maf$Tumor_Sample_Barcode maf$Amino_Acid_Change=gsub('p.','',maf$HGVSp_Short) - maf$Amino_Acid_Position=unlist(lapply(maf$Protein_position,function(x) unlist(strsplit(x,"/"))[1] )) - maf$Protein_Length=as.numeric(unlist(lapply(maf$Protein_position,function(x) unlist(strsplit(x,'\\/'))[2]))) - maf$Reference_Amino_Acid=unlist(lapply(1:nrow(maf), function(x) unlist(strsplit(maf$Amino_acids[x],'/'))[1])) - maf$Variant_Amino_Acid=unlist(lapply(1:nrow(maf), function(x) unlist(strsplit(maf$Amino_acids[x],'/'))[2])) + + # maf$Amino_Acid_Position=unlist(lapply(maf$Protein_position,function(x) unlist(strsplit(x,"/"))[1] )) + source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/MutSiteFind.R') + maf$Amino_Acid_Position <- MutSiteFind(gsub("p.","",maf$HGVSp_Short)) + + # maf$Protein_Length=as.numeric(unlist(lapply(maf$Protein_position,function(x) unlist(strsplit(x,'\\/'))[2]))) + maf$Protein_Length <- rep(0) + maf$Protein_Length[maf$Hugo_Symbol=="EGFR"] <- 1210 + maf$Protein_Length[maf$Hugo_Symbol=="ERBB2"] <- 1255 + maf$Protein_Length[maf$Hugo_Symbol=="ERBB3"] <- 1342 + maf$Protein_Length[maf$Hugo_Symbol=="ERBB4"] <- 1308 + + # maf$Reference_Amino_Acid=unlist(lapply(1:nrow(maf), function(x) unlist(strsplit(maf$Amino_acids[x],'/'))[1])) + # maf$Variant_Amino_Acid=unlist(lapply(1:nrow(maf), function(x) unlist(strsplit(maf$Amino_acids[x],'/'))[2])) + mut <- gsub("p.","",maf$HGVSp_Short,fixed = T) + idx <- stringi::stri_locate_first_regex(str = mut, pattern = "[[:digit:]]+") + maf$Reference_Amino_Acid <- stringi::stri_sub(str = mut,from = 0,to = idx[,1]-1) + maf$Variant_Amino_Acid <- stringi::stri_sub(str = mut,from = idx[,2]+1) + rm(idx) + maf$allele_freq=maf$t_alt_count/(maf$t_alt_count+maf$t_ref_count) #subset splice_site mutations - splice=c('splice_acceptor_variant','splice_donor_variant') - ss=maf[ which(maf$Consequence %in% splice), ] - maf=maf[ which(!maf$Consequence %in% splice), ] + # splice=c('splice_acceptor_variant','splice_donor_variant') + # ss=maf[ which(maf$Consequence %in% splice), ] + # maf=maf[ which(!maf$Consequence %in% splice), ] cat(' ... Re-annotating substitutions\n') # annotating oligo mutations @@ -622,22 +640,23 @@ prepmaf=function(maf,expressiontb) { maf=maf[-i, ] maf=rbind(maf,tm) } + # annotating splice site mutations - ss=ss[ nchar(ss$Reference_Allele)<3, ] - ss$Amino_Acid_Position=as.numeric(ss$Start_Position) - ss$Reference_Amino_Acid='SS' - ss$Variant_Amino_Acid='SS' - out=c() - for(i in unique(ss$Chromosome)) { - tm=ss[ ss$Chromosome==i, ] - while(any.neighbors(tm)) { - neighbors=get.neighbors(tm) - tm$Amino_Acid_Position[ tm$Amino_Acid_Position %in% neighbors ]=min(neighbors) - } - out=rbind(out,tm) - } - ss=out - maf=rbind(maf,ss) + # ss=ss[ nchar(ss$Reference_Allele)<3, ] + # ss$Amino_Acid_Position=as.numeric(ss$Start_Position) + # ss$Reference_Amino_Acid='SS' + # ss$Variant_Amino_Acid='SS' + # out=c() + # for(i in unique(ss$Chromosome)) { + # tm=ss[ ss$Chromosome==i, ] + # while(any.neighbors(tm)) { + # neighbors=get.neighbors(tm) + # tm$Amino_Acid_Position[ tm$Amino_Acid_Position %in% neighbors ]=min(neighbors) + # } + # out=rbind(out,tm) + # } + # ss=out + # maf=rbind(maf,ss) # remove putative germline mutations cat(' ... Removing putative germline SNPs based on ExAC r0.2\n') @@ -648,6 +667,5 @@ prepmaf=function(maf,expressiontb) { maf=remove.unexpressed.mutations(maf,expressiontb) maf$Amino_Acid_Position=as.numeric(maf$Amino_Acid_Position) maf$Variant_Type='SNP' - return(maf) } diff --git a/hotspot_algo.R b/hotspot_algo.R index 737e5f8..be9cf9c 100755 --- a/hotspot_algo.R +++ b/hotspot_algo.R @@ -141,12 +141,11 @@ cat('\nReading in MAF...\n') d=read.csv(maf_fn,header=T,as.is=T,sep="\t",comment.char='#') d=prepmaf(d,expressiontb) - # writing temp files to annotate MAF with trinucleotides write.table(d,'___temp_maf.tm',row.names=F,quote=F,sep="\t") system(command='python make_trinuc_maf.py ___temp_maf.tm ___temp_maf-tri.tm') -# clean up tmp files d=read.csv('___temp_maf-tri.tm',header=T,as.is=T,sep="\t",comment.char='#') +# clean up tmp files system(command='rm ___t*') cat('\nFinished prepping MAF!\n') @@ -164,6 +163,7 @@ if(GENES_INTEREST) { } # run algorithm +d$Chromosome <- gsub("chr","",d$Chromosome) cat('Running algorithm...\n') output=lapply(genes,binom.test_snp) output=do.call('rbind',output) diff --git a/make_trinuc_maf.py b/make_trinuc_maf.py index 3921f60..334ee59 100644 --- a/make_trinuc_maf.py +++ b/make_trinuc_maf.py @@ -33,19 +33,19 @@ tmpf.write(join(map(lambda x: join(x, '\t'), lines), '\n')) tmpf.flush() + # Make bed file of regions print " ... Making bed file" grep_ps = subprocess.Popen(["grep", "-Ev", "^#|^Hugo", "___tmp.maf"], stdout=subprocess.PIPE) -cut_ps = subprocess.Popen("cut -f5-7".split(" "), stdin=grep_ps.stdout, stdout=subprocess.PIPE) +cut_ps = subprocess.Popen("cut -f4-6".split(" "), stdin=grep_ps.stdout, stdout=subprocess.PIPE) awk_ps = subprocess.Popen(["awk", "{OFS=\"\\t\"; print $1,$2-2,$3+1}"], stdin=cut_ps.stdout, stdout=subprocess.PIPE) bedf = open("___tmp.bed","w") bedf.write(awk_ps.communicate()[0]) bedf.close() - # Fetch regions print " ... Getting regions" -subprocess.call("bedtools getfasta -tab -fi /ifs/depot/assemblies/H.sapiens/GRCh37/gr37.fasta -bed ___tmp.bed -fo ___tmp.tsv".split(" ")) +subprocess.call("bedtools getfasta -tab -fi /Volumes/DATA/Seafile/Bioinfo-Lab-Data/Ref.Genome/hg19/hg19.fa -bed ___tmp.bed -fo ___tmp.tsv".split(" ")) # Add trinuc to lines print " ... Adding trinucs (normalized to start from C or T)" diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..d804824 --- /dev/null +++ b/run.sh @@ -0,0 +1,16 @@ +# To capture everything and then filter later on +./hotspot_algo.R \ + --input-maf=ERBBs/Data/ERBBS.maf \ + --rdata=hotspot_algo.Rdata \ + --output-file=ERBBs/sig_hotspots.txt \ + --homopolymer=FALSE \ + --qval=1 + +mv putative_false_positive_mutations.txt ERBBs/. + +# ./hotspot_algo.R \ +# --input-maf=ERBBs/Data/ERBBS.maf \ +# --rdata=hotspot_algo.Rdata \ +# --gene-query=ERBBs/Data/genes.txt \ +# --output-file=sig_hotspots.txt \ +# --homopolymer=FALSE \ No newline at end of file