Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update bioconductor installation #17

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Runing_hotspots.R
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ Install dependent packages (**data.table**, **IRanges**, **BSgenome.Hsapiens.UCS

```
install.packages("data.table")
source("http://bioconductor.org/biocLite.R")
biocLite("IRanges","BSgenome.Hsapiens.UCSC.hg19")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(c("IRanges","BSgenome.Hsapiens.UCSC.hg19"))
```

####Usage:
Expand Down
76 changes: 47 additions & 29 deletions funcs.R
Original file line number Diff line number Diff line change
Expand Up @@ -588,27 +588,45 @@ prepmaf=function(maf,expressiontb) {

cat('Prepping MAF for analysis ...\n')
#only non-indel, coding mutations
coding=c('initiator_codon_variant','missense_variant','splice_acceptor_variant',
'splice_donor_variant','stop_gained','stop_lost','stop_retained_variant','synonymous_variant')
cat(' ... Reducing MAF to protein-coding substitutions\n')
maf=maf[ which(maf$Consequence %in% coding & maf$CANONICAL=='YES' & maf$BIOTYPE=='protein_coding'), ]
#remove indels
maf=maf[ which(!maf$Variant_Type%in%c('INS','DEL')), ]

coding=c('initiator_codon_variant','missense_variant','splice_acceptor_variant', 'splice_donor_variant','stop_gained','stop_lost','stop_retained_variant','synonymous_variant','inframe_insertion',"inframe_deletion","frameshift_truncation")

cat(' ... Reducing MAF to protein-coding substitutions\n')
maf <- maf[ which(maf$Consequence %in% coding & maf$CANONICAL=='YES' & maf$BIOTYPE=='protein_coding'), ]

# remove indels
# maf=maf[ which(!maf$Variant_Type%in%c('INS','DEL')), ]

# add additional annotations
if(!'TUMORTYPE' %in% colnames(maf)) maf$TUMORTYPE='none'
if(!'Master_ID' %in% colnames(maf)) maf$Master_ID=maf$Tumor_Sample_Barcode
maf$Amino_Acid_Change=gsub('p.','',maf$HGVSp_Short)
maf$Amino_Acid_Position=unlist(lapply(maf$Protein_position,function(x) unlist(strsplit(x,"/"))[1] ))
maf$Protein_Length=as.numeric(unlist(lapply(maf$Protein_position,function(x) unlist(strsplit(x,'\\/'))[2])))
maf$Reference_Amino_Acid=unlist(lapply(1:nrow(maf), function(x) unlist(strsplit(maf$Amino_acids[x],'/'))[1]))
maf$Variant_Amino_Acid=unlist(lapply(1:nrow(maf), function(x) unlist(strsplit(maf$Amino_acids[x],'/'))[2]))

# maf$Amino_Acid_Position=unlist(lapply(maf$Protein_position,function(x) unlist(strsplit(x,"/"))[1] ))
source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/MutSiteFind.R')
maf$Amino_Acid_Position <- MutSiteFind(gsub("p.","",maf$HGVSp_Short))

# maf$Protein_Length=as.numeric(unlist(lapply(maf$Protein_position,function(x) unlist(strsplit(x,'\\/'))[2])))
maf$Protein_Length <- rep(0)
maf$Protein_Length[maf$Hugo_Symbol=="EGFR"] <- 1210
maf$Protein_Length[maf$Hugo_Symbol=="ERBB2"] <- 1255
maf$Protein_Length[maf$Hugo_Symbol=="ERBB3"] <- 1342
maf$Protein_Length[maf$Hugo_Symbol=="ERBB4"] <- 1308

# maf$Reference_Amino_Acid=unlist(lapply(1:nrow(maf), function(x) unlist(strsplit(maf$Amino_acids[x],'/'))[1]))
# maf$Variant_Amino_Acid=unlist(lapply(1:nrow(maf), function(x) unlist(strsplit(maf$Amino_acids[x],'/'))[2]))
mut <- gsub("p.","",maf$HGVSp_Short,fixed = T)
idx <- stringi::stri_locate_first_regex(str = mut, pattern = "[[:digit:]]+")
maf$Reference_Amino_Acid <- stringi::stri_sub(str = mut,from = 0,to = idx[,1]-1)
maf$Variant_Amino_Acid <- stringi::stri_sub(str = mut,from = idx[,2]+1)
rm(idx)

maf$allele_freq=maf$t_alt_count/(maf$t_alt_count+maf$t_ref_count)

#subset splice_site mutations
splice=c('splice_acceptor_variant','splice_donor_variant')
ss=maf[ which(maf$Consequence %in% splice), ]
maf=maf[ which(!maf$Consequence %in% splice), ]
# splice=c('splice_acceptor_variant','splice_donor_variant')
# ss=maf[ which(maf$Consequence %in% splice), ]
# maf=maf[ which(!maf$Consequence %in% splice), ]

cat(' ... Re-annotating substitutions\n')
# annotating oligo mutations
Expand All @@ -622,22 +640,23 @@ prepmaf=function(maf,expressiontb) {
maf=maf[-i, ]
maf=rbind(maf,tm)
}

# annotating splice site mutations
ss=ss[ nchar(ss$Reference_Allele)<3, ]
ss$Amino_Acid_Position=as.numeric(ss$Start_Position)
ss$Reference_Amino_Acid='SS'
ss$Variant_Amino_Acid='SS'
out=c()
for(i in unique(ss$Chromosome)) {
tm=ss[ ss$Chromosome==i, ]
while(any.neighbors(tm)) {
neighbors=get.neighbors(tm)
tm$Amino_Acid_Position[ tm$Amino_Acid_Position %in% neighbors ]=min(neighbors)
}
out=rbind(out,tm)
}
ss=out
maf=rbind(maf,ss)
# ss=ss[ nchar(ss$Reference_Allele)<3, ]
# ss$Amino_Acid_Position=as.numeric(ss$Start_Position)
# ss$Reference_Amino_Acid='SS'
# ss$Variant_Amino_Acid='SS'
# out=c()
# for(i in unique(ss$Chromosome)) {
# tm=ss[ ss$Chromosome==i, ]
# while(any.neighbors(tm)) {
# neighbors=get.neighbors(tm)
# tm$Amino_Acid_Position[ tm$Amino_Acid_Position %in% neighbors ]=min(neighbors)
# }
# out=rbind(out,tm)
# }
# ss=out
# maf=rbind(maf,ss)

# remove putative germline mutations
cat(' ... Removing putative germline SNPs based on ExAC r0.2\n')
Expand All @@ -648,6 +667,5 @@ prepmaf=function(maf,expressiontb) {
maf=remove.unexpressed.mutations(maf,expressiontb)
maf$Amino_Acid_Position=as.numeric(maf$Amino_Acid_Position)
maf$Variant_Type='SNP'

return(maf)
}
4 changes: 2 additions & 2 deletions hotspot_algo.R
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,11 @@ cat('\nReading in MAF...\n')
d=read.csv(maf_fn,header=T,as.is=T,sep="\t",comment.char='#')
d=prepmaf(d,expressiontb)


# writing temp files to annotate MAF with trinucleotides
write.table(d,'___temp_maf.tm',row.names=F,quote=F,sep="\t")
system(command='python make_trinuc_maf.py ___temp_maf.tm ___temp_maf-tri.tm')
# clean up tmp files
d=read.csv('___temp_maf-tri.tm',header=T,as.is=T,sep="\t",comment.char='#')
# clean up tmp files
system(command='rm ___t*')
cat('\nFinished prepping MAF!\n')

Expand All @@ -164,6 +163,7 @@ if(GENES_INTEREST) {
}

# run algorithm
d$Chromosome <- gsub("chr","",d$Chromosome)
cat('Running algorithm...\n')
output=lapply(genes,binom.test_snp)
output=do.call('rbind',output)
Expand Down
6 changes: 3 additions & 3 deletions make_trinuc_maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,19 @@
tmpf.write(join(map(lambda x: join(x, '\t'), lines), '\n'))
tmpf.flush()


# Make bed file of regions
print " ... Making bed file"
grep_ps = subprocess.Popen(["grep", "-Ev", "^#|^Hugo", "___tmp.maf"], stdout=subprocess.PIPE)
cut_ps = subprocess.Popen("cut -f5-7".split(" "), stdin=grep_ps.stdout, stdout=subprocess.PIPE)
cut_ps = subprocess.Popen("cut -f4-6".split(" "), stdin=grep_ps.stdout, stdout=subprocess.PIPE)
awk_ps = subprocess.Popen(["awk", "{OFS=\"\\t\"; print $1,$2-2,$3+1}"], stdin=cut_ps.stdout, stdout=subprocess.PIPE)
bedf = open("___tmp.bed","w")
bedf.write(awk_ps.communicate()[0])
bedf.close()


# Fetch regions
print " ... Getting regions"
subprocess.call("bedtools getfasta -tab -fi /ifs/depot/assemblies/H.sapiens/GRCh37/gr37.fasta -bed ___tmp.bed -fo ___tmp.tsv".split(" "))
subprocess.call("bedtools getfasta -tab -fi /Volumes/DATA/Seafile/Bioinfo-Lab-Data/Ref.Genome/hg19/hg19.fa -bed ___tmp.bed -fo ___tmp.tsv".split(" "))

# Add trinuc to lines
print " ... Adding trinucs (normalized to start from C or T)"
Expand Down
16 changes: 16 additions & 0 deletions run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# To capture everything and then filter later on
./hotspot_algo.R \
--input-maf=ERBBs/Data/ERBBS.maf \
--rdata=hotspot_algo.Rdata \
--output-file=ERBBs/sig_hotspots.txt \
--homopolymer=FALSE \
--qval=1

mv putative_false_positive_mutations.txt ERBBs/.

# ./hotspot_algo.R \
# --input-maf=ERBBs/Data/ERBBS.maf \
# --rdata=hotspot_algo.Rdata \
# --gene-query=ERBBs/Data/genes.txt \
# --output-file=sig_hotspots.txt \
# --homopolymer=FALSE