# Rhistory

# Install the plotting / data-wrangling packages only when they are not already
# installed, so re-running these lines does not reinstall them every time.
missing.pkgs <- setdiff(c("ggplot2", "plyr", "dplyr"), rownames(installed.packages()))
if (length(missing.pkgs) > 0) {
  install.packages(missing.pkgs)
}

# Work from the shared project folder and load the peptide comparison report.
setwd(dir = "/Users/mszucs/Dropbox/To_Pass_Over/")
peptide <- read.csv(file = "NgKD_Protein_Peptide_Comparison_Filtered2_060116.csv")

# NOTE(review): this is a Windows-style path while the setwd() above is a
# macOS-style path -- confirm which machine this is meant to run on.
source('C:/Users/mszucs/Dropbox/To_Pass_Over/NgKD_Protein_Phosphosite_Work_071916.R')
pepFilter <- function(pepdf, species = "MOUSE") {
  # Filters a protein peptide comparison data frame down to a single species.
  #
  #   Input:
  #        pepdf   = protein peptide comparison data frame (as read from the
  #                  .csv); must contain a `species` column
  #        species = species label to keep (default "MOUSE")
  #   Output:
  #        the rows of pepdf whose `species` value equals `species`. Rows with
  #        a missing species are dropped. The result is always a data frame,
  #        even when pepdf has a single column.
  if (!"species" %in% names(pepdf)) {
    stop("pepdf must contain a 'species' column", call. = FALSE)
  }
  # which() discards NA comparisons, so rows with an NA species never come
  # back as all-NA rows.
  keep <- which(pepdf[["species"]] == species)
  # drop = FALSE stops a one-column data frame collapsing to a bare vector.
  pepdf[keep, , drop = FALSE]
}
pepSeqFix <- function(pepdf) {
  # Adds sequence-derived helper columns to a filtered protein peptide
  # comparison data frame.
  #
  #   Input:
  #      pepdf = filtered peptide protein comparison data frame; must contain
  #              the columns `sequence`, `z` and `protein_group_num`
  #   Output:
  #      pepdf with three columns added:
  #        sequence_capital = the sequence in upper case
  #        sequence_length  = number of characters in the sequence
  #        sequence_charge  = "<SEQUENCE>_<z>_<protein_group_num>", the
  #                           identifier built from the upper-case sequence,
  #                           the charge column and the protein group number
  required <- c("sequence", "z", "protein_group_num")
  missing.cols <- setdiff(required, names(pepdf))
  if (length(missing.cols) > 0) {
    # Without this check a missing z / protein_group_num column would be
    # pasted in as an empty string and silently yield malformed identifiers.
    stop("pepdf is missing required column(s): ",
         paste(missing.cols, collapse = ", "), call. = FALSE)
  }

  pep.seq <- as.character(pepdf[["sequence"]])
  pepdf$sequence_capital <- toupper(pep.seq)
  pepdf$sequence_length <- nchar(pep.seq)
  pepdf$sequence_charge <- paste0(pepdf$sequence_capital, "_",
                                  pepdf[["z"]], "_",
                                  pepdf[["protein_group_num"]])
  pepdf
}
pepClean <- function(pep, seq = FALSE) {
  # Reduces the peptide report to the identifier column plus the numeric
  # columns used for the protein roll up, keeping only complete rows.
  #
  #   Input:
  #     pep = peptide data frame (output of pepSeqFix)
  #     seq = if TRUE keep `sequence_capital` as the identifier column,
  #           otherwise keep `accession_number`
  #   Output:
  #     pep.parsed = data frame holding whichever of the wanted columns are
  #                  present in pep (in their original order), with every row
  #                  containing an NA removed.
  # `== TRUE` is kept so truthy non-logical values such as 1 behave as before.
  id.column <- if (seq == TRUE) "sequence_capital" else "accession_number"
  wanted <- c(id.column, "score", paste0("TMT_", 126:131),
              "deltaForwardReverseScore", "sequence_length",
              "delta_parent_mass")
  # Exact name matching replaces a regular expression whose string literal was
  # wrapped across two lines and therefore carried a stray newline alternative.
  # drop = FALSE keeps a data frame even when only one column matches;
  # otherwise the row subset below fails on a bare vector.
  pep.parsed <- pep[, names(pep) %in% wanted, drop = FALSE]
  pep.parsed[complete.cases(pep.parsed), , drop = FALSE]
}
pepGroupSummary <- function(pepdf.parsed) {
  # Rolls the cleaned peptide data frame up to one row per accession_number,
  # reporting the mean and standard deviation (3 significant figures) of each
  # measured column, and drops proteins supported by fewer than 2 peptides.
  #
  #   Input:
  #      pepdf.parsed = cleaned peptide data frame containing accession_number,
  #                     score, TMT_126 ... TMT_131, delta_parent_mass,
  #                     deltaForwardReverseScore and sequence_length
  #   Output:
  #     pep.summary.filter = one row per accession_number with columns
  #        freq, <measure>_mean, <measure>_sd for every measure. The mean of
  #        sequence_length keeps its historical column name `sequence_length`.
  measures <- c("score", paste0("TMT_", 126:131), "delta_parent_mass",
                "deltaForwardReverseScore", "sequence_length")
  mean.names <- paste0(measures, "_mean")
  mean.names[measures == "sequence_length"] <- "sequence_length"
  sd.names <- paste0(measures, "_sd")

  # A per-group function is used instead of summarize(): summarize evaluates
  # its arguments in order in one environment, so naming a result
  # `sequence_length` replaced the raw column before sd() was taken (the sd
  # was always NA), and repeated result names (TMT_126_sd,
  # deltaForwardReverseScore_mean) meant the TMT_129 and
  # deltaForwardReverseScore standard deviations were never reported.
  summarise.protein <- function(piece) {
    stats <- list(freq = nrow(piece))
    for (i in seq_along(measures)) {
      values <- piece[[measures[i]]]
      stats[[mean.names[i]]] <- signif(mean(values), 3)
      stats[[sd.names[i]]] <- signif(sd(values), 3)
    }
    as.data.frame(stats, stringsAsFactors = FALSE)
  }

  pep.summary <- plyr::ddply(pepdf.parsed, "accession_number", summarise.protein)
  # Keep only proteins quantified by at least two peptides.
  pep.summary[which(pep.summary$freq >= 2), , drop = FALSE]
}
#############################################################################################################
# Functions that work with the cyberT output and characterization of significant peptides
#
#############################################################################################################
setwd("/Users/mszucs/Desktop/To_Pass_Over/")
c.Out <- read.delim(file = "NgKD_Cyber_T_test_Results_061316.txt")
# Show the current working directory. This replaces three mistyped attempts
# (`wd`, `ws()`, `checkwd`) that are not defined anywhere and each raise an
# error.
getwd()
setwd(dir = "/Users/mszucs/Dropbox/To_Pass_Over/")
# Keep only the default species ("MOUSE") from the peptide report.
d <- pepFilter(peptide)
pepSeqFix <- function(pepdf) {
  # Adds sequence-derived helper columns to a filtered protein peptide
  # comparison data frame.
  #
  #   Input:
  #      pepdf = filtered peptide protein comparison data frame; must contain
  #              the columns `sequence`, `z` and `protein_group_num`
  #   Output:
  #      pepdf with three columns added:
  #        sequence_capital = the sequence in upper case
  #        sequence_length  = number of characters in the sequence
  #        sequence_charge  = "<SEQUENCE>_<z>_<protein_group_num>", the
  #                           identifier built from the upper-case sequence,
  #                           the charge column and the protein group number
  required <- c("sequence", "z", "protein_group_num")
  missing.cols <- setdiff(required, names(pepdf))
  if (length(missing.cols) > 0) {
    # Without this check a missing z / protein_group_num column would be
    # pasted in as an empty string and silently yield malformed identifiers.
    stop("pepdf is missing required column(s): ",
         paste(missing.cols, collapse = ", "), call. = FALSE)
  }

  pep.seq <- as.character(pepdf[["sequence"]])
  pepdf$sequence_capital <- toupper(pep.seq)
  pepdf$sequence_length <- nchar(pep.seq)
  pepdf$sequence_charge <- paste0(pepdf$sequence_capital, "_",
                                  pepdf[["z"]], "_",
                                  pepdf[["protein_group_num"]])
  pepdf
}
# Add the upper-case sequence, length and identifier columns, then inspect.
d1 <- pepSeqFix(pepdf = d)
View(d1)
pepClean <- function(pep, seq = FALSE) {
  # Reduces the peptide report to the identifier column plus the numeric
  # columns used for the protein roll up, keeping only complete rows.
  #
  #   Input:
  #     pep = peptide data frame (output of pepSeqFix)
  #     seq = if TRUE keep `sequence_capital` as the identifier column,
  #           otherwise keep `accession_number`
  #   Output:
  #     pep.parsed = data frame holding whichever of the wanted columns are
  #                  present in pep (in their original order), with every row
  #                  containing an NA removed.
  # `== TRUE` is kept so truthy non-logical values such as 1 behave as before.
  id.column <- if (seq == TRUE) "sequence_capital" else "accession_number"
  wanted <- c(id.column, "score", paste0("TMT_", 126:131),
              "deltaForwardReverseScore", "sequence_length",
              "delta_parent_mass")
  # Exact name matching replaces a regular expression whose string literal was
  # wrapped across two lines and therefore carried a stray newline alternative.
  # drop = FALSE keeps a data frame even when only one column matches;
  # otherwise the row subset below fails on a bare vector.
  pep.parsed <- pep[, names(pep) %in% wanted, drop = FALSE]
  pep.parsed[complete.cases(pep.parsed), , drop = FALSE]
}
# Build the reduced peptide table, then open it next to its two source tables
# for a visual comparison.
cleaned_d1 <- pepClean(pep = d1)
View(cleaned_d1)
View(d1)
View(peptide)
pepClean <- function(pep, seq = FALSE) {
  # Reduces the peptide report to the identifier column plus the numeric
  # columns used for the protein roll up, keeping only complete rows.
  #
  #   Input:
  #     pep = peptide data frame (output of pepSeqFix)
  #     seq = if TRUE keep `sequence_capital` as the identifier column,
  #           otherwise keep `accession_number`
  #   Output:
  #     pep.parsed = data frame holding whichever of the wanted columns are
  #                  present in pep (in their original order), with every row
  #                  containing an NA removed.
  # `== TRUE` is kept so truthy non-logical values such as 1 behave as before.
  id.column <- if (seq == TRUE) "sequence_capital" else "accession_number"
  # Both identifier modes now select the `TMT_<channel>_total` columns;
  # previously only the seq = TRUE branch had been switched to the `_total`
  # names, so the default branch picked different reporter-ion columns.
  wanted <- c(id.column, "score", paste0("TMT_", 126:131, "_total"),
              "deltaForwardReverseScore", "sequence_length",
              "delta_parent_mass")
  # Exact name matching replaces a regular expression whose string literal was
  # wrapped across two lines and therefore carried a stray newline alternative.
  # drop = FALSE keeps a data frame even when only one column matches;
  # otherwise the row subset below fails on a bare vector.
  pep.parsed <- pep[, names(pep) %in% wanted, drop = FALSE]
  pep.parsed[complete.cases(pep.parsed), , drop = FALSE]
}
# Re-run the clean-up with the redefined pepClean and look at the input again.
cleaned_d1 <- pepClean(pep = d1)
View(d1)
pepClean <- function(pep, seq = FALSE) {
  # Reduces the peptide report to the identifier column plus the numeric
  # columns used for the protein roll up, keeping only complete rows.
  #
  #   Input:
  #     pep = peptide data frame (output of pepSeqFix)
  #     seq = if TRUE keep `sequence_capital` as the identifier column,
  #           otherwise keep `accession_number`
  #   Output:
  #     pep.parsed = data frame holding whichever of the wanted columns are
  #                  present in pep (in their original order), with every row
  #                  containing an NA removed.
  # `== TRUE` is kept so truthy non-logical values such as 1 behave as before.
  id.column <- if (seq == TRUE) "sequence_capital" else "accession_number"
  wanted <- c(id.column, "score", paste0("TMT_", 126:131, "_total"),
              "deltaForwardReverseScore", "sequence_length",
              "delta_parent_mass")
  # Exact name matching replaces a regular expression whose string literal was
  # wrapped across two lines and therefore carried a stray newline alternative.
  # drop = FALSE keeps a data frame even when only one column matches;
  # otherwise the row subset below fails on a bare vector.
  pep.parsed <- pep[, names(pep) %in% wanted, drop = FALSE]
  pep.parsed[complete.cases(pep.parsed), , drop = FALSE]
}
# Rebuild the reduced peptide table with the latest pepClean and inspect it.
cleaned_d1 <- pepClean(pep = d1)
View(cleaned_d1)
View(cleaned_d1)

# CyberT peptide input table.
t <- read.delim("NgKD_CyberT_Peptide_Input_061316.txt")
View(t)

# CyberT results; the first column is renamed so it can be joined to the key.
c.Out <- read.delim(file = "NgKD_Cyber_T_test_Results_061316.txt")
View(c.Out)
names(c.Out)[1] <- "id"

# Key table for the CyberT input.
c.Key <- read.delim(file = "NgKD_CyberT_Peptide_Input_Key_061316.txt")
View(c.Key)
cyberTMerge <- function(cyT.output, cyT.key) {
  # Merges the CyberT output with the CyberT key on the shared `id` column.
  #
  # Input:
  #   cyT.output = the output of the CyberT test; if it has no column named
  #                `id`, its first column is taken to be the identifier and
  #                is renamed to `id`
  #   cyT.key    = the CyberT key; must contain an `id` column
  # Output:
  #   cyT.merge = cyT.output joined to cyT.key by `id` (rows present in both)
  #
  # The original line here was a comparison (`==`) whose result was thrown
  # away, so the rename never happened inside the function.
  if (!"id" %in% colnames(cyT.output)) {
    colnames(cyT.output)[1] <- "id"
  }
  merge(cyT.output, cyT.key, by = "id")
}
# Join the CyberT results to their key and inspect the merged table.
t1 <- cyberTMerge(cyT.output = c.Out, cyT.key = c.Key)
View(t1)
cyberTSignif <- function(cyT.merge, FDR = 0.01) {
  # Flags each row of the merged CyberT output as significant or not, based on
  # a cutoff applied to its `BH` column.
  #
  # Input:
  #    cyT.merge = merged CyberT output; must contain a `BH` column
  #    FDR       = cutoff; rows with BH strictly below it are flagged
  # Output:
  #   cyT.merge with an added numeric column `significant`:
  #   1 when BH < FDR, 0 otherwise, NA when BH is NA.
  if (!"BH" %in% names(cyT.merge)) {
    stop("cyT.merge must contain a 'BH' column", call. = FALSE)
  }
  # as.numeric() on the comparison gives the same 1 / 0 / NA values as
  # ifelse(), but stays numeric for a zero-row input, where ifelse() returns
  # a logical vector.
  cyT.merge$significant <- as.numeric(cyT.merge[["BH"]] < FDR)
  cyT.merge
}
# Add the significance flag using the default cutoff, then inspect.
t1 <- cyberTSignif(cyT.merge = t1)
View(t1)
cyberTReport <- function(cyT.merge, cyT.key) {
  # Rolls the flagged CyberT output up to one row per accession_number and
  # attaches the key columns for each accession.
  #
  #   Input:
  #      cyT.merge = merged CyberT output containing `accession_number`,
  #                  `significant` and `fold`
  #      cyT.key   = the CyberT key; its columns whose names contain
  #                  "accession_number" or "entry" are attached to the result
  #   Output:
  #     pep.summary = per accession_number: freq (number of rows),
  #                   signif_peps (sum of `significant`), signif_percent
  #                   (signif_peps / freq * 100, 3 significant figures),
  #                   median_fold_Change (median of `fold`, 3 significant
  #                   figures), plus the matching key columns.
  key.cols <- grep("accession_number|entry", names(cyT.key))
  # drop = FALSE keeps the key a data frame even when only one column
  # matches, which merge() below needs in order to find `accession_number`.
  key <- unique(cyT.key[, key.cols, drop = FALSE])

  # signif_percent deliberately reuses signif_peps and freq computed just
  # before it; plyr::summarize evaluates its arguments in order. The calls are
  # namespace-qualified so a masking summarize from another attached package
  # cannot be picked up.
  pep.summary <- plyr::ddply(
    cyT.merge, "accession_number", plyr::summarize,
    freq = length(accession_number),
    signif_peps = sum(significant),
    signif_percent = signif((signif_peps / freq) * 100, 3),
    median_fold_Change = signif(median(fold), 3)
  )
  merge(pep.summary, key, by = "accession_number")
}
# Per-protein significance report. An earlier attempt passed `cyt.key`, a
# mistyped name that is not defined anywhere and made the call fail; only the
# corrected call is kept.
t2 <- cyberTReport(t1, c.Key)
View(t2)

# Raw-intensity CyberT input used for the peptide-level summaries.
cyT.input <- read.delim(file = "NgKD_CyberT_Peptide_Input_061316.txt")
View(cyT.input)
cyberTPepSummary <- function(cyT.input, control.pattern = "C", exp.pattern = "E") {
  # Adds per-row summary statistics for the control and experimental
  # intensity columns of the CyberT input.
  #
  # Input:
  #   cyT.input       = CyberT input with the raw intensities
  #   control.pattern = regular expression identifying control columns by
  #                     name (default "C", i.e. any name containing a
  #                     capital C)
  #   exp.pattern     = regular expression identifying experimental columns
  #                     by name (default "E")
  # Output:
  #   cyT.input with these columns appended:
  #     avg_c, avg_c_sd = row mean / sd of the control columns
  #     avg_e, avg_e_sd = row mean / sd of the experimental columns
  #     cv_e, cv_c      = sd / mean * 100, 3 significant figures
  #     ratio           = avg_e / avg_c
  #     ratio_sd        = ratio * sqrt((avg_c_sd/avg_c)^2 + (avg_e_sd/avg_e)^2)
  #
  # Column groups are located before any column is added, and drop = FALSE
  # keeps each group a data frame even if only one column matches.
  control.cols <- cyT.input[, grep(control.pattern, names(cyT.input)), drop = FALSE]
  exp.cols <- cyT.input[, grep(exp.pattern, names(cyT.input)), drop = FALSE]

  # The means ignore missing intensities, so the standard deviations must too;
  # without na.rm a single NA made the sd (and cv, ratio_sd) NA while the mean
  # was still reported.
  row.sd <- function(cols) apply(cols, 1, sd, na.rm = TRUE)

  cyT.input$avg_c <- rowMeans(control.cols, na.rm = TRUE)
  cyT.input$avg_c_sd <- row.sd(control.cols)
  cyT.input$avg_e <- rowMeans(exp.cols, na.rm = TRUE)
  cyT.input$avg_e_sd <- row.sd(exp.cols)
  cyT.input$cv_e <- signif((cyT.input$avg_e_sd / cyT.input$avg_e) * 100, 3)
  cyT.input$cv_c <- signif((cyT.input$avg_c_sd / cyT.input$avg_c) * 100, 3)
  cyT.input$ratio <- cyT.input$avg_e / cyT.input$avg_c
  # Relative errors of the two means are combined in quadrature.
  cyT.input$ratio_sd <- cyT.input$ratio *
    sqrt((cyT.input$avg_c_sd / cyT.input$avg_c)^2 +
           (cyT.input$avg_e_sd / cyT.input$avg_e)^2)
  cyT.input
}
# Per-peptide averages, standard deviations and ratios, then inspect.
r1 <- cyberTPepSummary(cyT.input = cyT.input)
View(r1)
cyberTPepRatio <- function(cyT.input, cyT.key) {
  # Summarises the per-peptide ratios for each accession_number.
  #
  # Input:
  #   cyT.input = CyberT input with summary statistics; must contain `id`
  #               and `ratio`
  #   cyT.key   = CyberT key; must contain `id` and `accession_number`
  # Output:
  #   one row per accession_number with
  #     freq                 = number of rows for that accession
  #     log2_median_ratio    = log2 of the median ratio
  #     log2_average_ratio   = log2 of the mean ratio
  #     linear_median_ratio  = median ratio
  #     linear_average_ratio = mean ratio
  #   (all statistics to 3 significant figures).
  cyT.merge <- merge(cyT.input, cyT.key, by = "id")
  # Drop every column whose name contains "entry". A logical mask is used
  # because negating an empty grep() result would select no columns at all
  # when nothing matches.
  cyT.merge <- cyT.merge[, !grepl("entry", names(cyT.merge)), drop = FALSE]

  # The last column was previously misspelled `inear_average_ratio`.
  plyr::ddply(
    cyT.merge, "accession_number", plyr::summarize,
    freq = length(accession_number),
    log2_median_ratio = signif(log2(median(ratio)), 3),
    log2_average_ratio = signif(log2(mean(ratio)), 3),
    linear_median_ratio = signif(median(ratio), 3),
    linear_average_ratio = signif(mean(ratio), 3)
  )
}
# Per-protein ratio summary.
r2 <- cyberTPepRatio(cyT.input = r1, cyT.key = c.Key)
View(r2)

# Distribution of the peptide-level ratios and protein-level log2 medians.
hist(r1$ratio)
hist(r1$ratio, breaks = "fd")
range(r1$ratio)
hist(r2$log2_median_ratio, breaks = "fd")

# Convert a log2 value of 0.24 back to a linear ratio.
2^0.24

# Scratch look at the built-in `cars` data set.
cars
summary(cars$speed)

View(r1)
View(r2)
View(r2)

# Look up individual entries in the peptide report by gene symbol / sequence.
peptide[which(peptide$geneSymbol == "EPHX1"), ]
peptide[which(peptide$sequence == "FVSLAELQ"), ]
x <- peptide[which(peptide$sequence == "FVSLAELQ"), ]
View(x)