tools/horvath_clock/StepwiseAnalysis.R

# Steve Horvath: Estimating DNAm age.
# This file assumes that a data frame called dat1 exists whose rows correspond to CpGs,
# whose first column reports the CpG identifier,
# and whose remaining columns correspond to samples (e.g. Illumina arrays).


# FALSE selects the impute.knn() branch of STEP 2 (when its other conditions
# hold); STEP 2 itself sets the flag to TRUE if a sample has too many NAs.
fastImputation <- FALSE

# STEP 1: DEFINE QUALITY METRICS

# All columns of dat1 except the first (the CpG identifier) as one matrix,
# so the per-sample (column-wise) summaries below share a single conversion.
sampleValueMatrix <- as.matrix(dat1[, -1])

meanMethBySample <- as.numeric(apply(sampleValueMatrix, 2, mean, na.rm = TRUE))
minMethBySample <- as.numeric(apply(sampleValueMatrix, 2, min, na.rm = TRUE))
maxMethBySample <- as.numeric(apply(sampleValueMatrix, 2, max, na.rm = TRUE))

# The temporary copy is no longer needed.
rm(sampleValueMatrix)

# Transpose: rows become samples, columns become CpGs named by dat1's
# first column.
datMethUsed <- t(dat1[, -1])
colnames(datMethUsed) <- as.character(dat1[, 1])

# Count of missing values in each row (sample) of datMethUsed.
noMissingPerSample <- apply(as.matrix(is.na(datMethUsed)), 1, sum)
table(noMissingPerSample)

# STEP 2: Imputing
#
# Missing values are handled in one of three ways, depending on the largest
# per-sample missing count:
#   * impute.knn() is used when fastImputation is FALSE, there is more than
#     one sample, and every sample has fewer than 3000 missing values;
#   * when fastImputation is TRUE or there is a single sample, missing entries
#     are replaced by the corresponding goldstandard2 value, provided every
#     sample has fewer than 3000 missing values;
#   * when some sample has 3000 or more missing values, nothing is imputed and
#     normalizeData is set to FALSE.
#
# All conditions below are scalar, so the short-circuit operators && and ||
# are used instead of the elementwise & and |.

if (!fastImputation && nSamples > 1 && max(noMissingPerSample, na.rm = TRUE) < 3000) {
  # Only call the imputation if at least one value is missing.
  if (max(noMissingPerSample, na.rm = TRUE) > 0) {
    dimnames1 <- dimnames(datMethUsed)
    datMethUsed <- data.frame(t(impute.knn(t(datMethUsed))$data))
    # Put the original row/column names back on the rebuilt data frame.
    dimnames(datMethUsed) <- dimnames1
  }
}

# Too many missing values in some sample: fall through to the fast branch.
if (max(noMissingPerSample, na.rm = TRUE) >= 3000) fastImputation <- TRUE

if (fastImputation || nSamples == 1) {
  noMissingPerSample <- apply(as.matrix(is.na(datMethUsed)), 1, sum)
  maxMissing <- max(noMissingPerSample, na.rm = TRUE)

  # With 3000 or more missing values in a sample, skip normalization.
  if (maxMissing >= 3000) {
    normalizeData <- FALSE
  }

  # Otherwise, if anything is missing, fill it from the gold standard.
  if (maxMissing > 0 && maxMissing < 3000) {
    dimnames1 <- dimnames(datMethUsed)
    for (i in which(noMissingPerSample > 0)) {
      selectMissing1 <- is.na(datMethUsed[i, ])
      # NOTE(review): this indexes goldstandard2 with a mask over the columns
      # of datMethUsed, so it assumes probeAnnotation21kdatMethUsed is in the
      # same probe order as those columns - confirm against the caller.
      datMethUsed[i, selectMissing1] <- as.numeric(probeAnnotation21kdatMethUsed$goldstandard2[selectMissing1])
    }
    dimnames(datMethUsed) <- dimnames1
  }
}


# STEP 3: Data normalization (each sample requires about 8 seconds). It would be straightforward to parallelize this operation.

# Either calibrate the data with BMIQcalibration() against goldstandard2,
# or pass the data through unchanged when normalization is switched off.
if (normalizeData) {
  datMethUsedNormalized <- BMIQcalibration(
    datM = datMethUsed,
    goldstandard.beta = probeAnnotation21kdatMethUsed$goldstandard2,
    plots = FALSE
  )
} else {
  datMethUsedNormalized <- datMethUsed
}

# The un-normalized copy is not used below; release it.
rm(datMethUsed)
gc()

# Subset to common probes.
# Keep only the clock rows whose CpG is present in the data (plus the row
# labelled "(Intercept)"), and only the data columns that are clock CpGs.
# The first row of datClock is skipped when collecting CpG names, matching
# the [-1] indexing used by the prediction step.
common_probes <- intersect(as.character(datClock$CpGmarker[-1]), dimnames(datMethUsedNormalized)[[2]])
datClock <- datClock[datClock$CpGmarker %in% c("(Intercept)", common_probes), ]
# drop = FALSE keeps the result two-dimensional. Without it a single-sample
# (one-row) input, or a selection of exactly one column, collapses to a plain
# vector with no dimnames, and the later dimnames(...)[[2]] lookup fails.
datMethUsedNormalized <- datMethUsedNormalized[, colnames(datMethUsedNormalized) %in% common_probes, drop = FALSE]

# STEP 4: Predict age and create a data frame for the output (referred to as datout)

# Mark the columns of the normalized data whose names appear among the clock
# CpGs (every row of datClock except the first).
selectCpGsClock <- is.element(
  dimnames(datMethUsedNormalized)[[2]],
  as.character(datClock$CpGmarker[-1])
)

# The number of matched columns must equal the number of clock CpGs:
# fewer means CpGs are missing, more means duplicated CpG columns.
if (sum(selectCpGsClock) < dim(datClock)[[1]] - 1) {
  stop("The CpGs listed in column 1 of the input data did not contain the CpGs needed for calculating DNAm age. Make sure to input cg numbers such as cg00075967.")
}
if (sum(selectCpGsClock) > dim(datClock)[[1]] - 1) {
  stop("ERROR: The CpGs listed in column 1 of the input data contain duplicate CpGs. Each row should report only one unique CpG marker (cg number).")
}

if (nSamples > 1) {
  # Select the clock columns, then reorder them by name to match datClock.
  datMethClock0 <- data.frame(datMethUsedNormalized[, selectCpGsClock])
  datMethClock <- data.frame(datMethClock0[as.character(datClock$CpGmarker[-1])])
  dim(datMethClock)
  # Intercept (first coefficient) plus the weighted sum over the clock CpGs,
  # passed through anti.trafo().
  predictedAge <- as.numeric(
    anti.trafo(
      datClock$CoefficientTraining[1] +
        as.matrix(datMethClock) %*% as.numeric(datClock$CoefficientTraining[-1])
    )
  )
} else if (nSamples == 1) {
  # Stack the single sample on itself so the column selection below works on
  # two rows; only the first prediction is kept.
  datMethUsedNormalized2 <- data.frame(rbind(datMethUsedNormalized, datMethUsedNormalized))
  datMethClock0 <- data.frame(datMethUsedNormalized2[, selectCpGsClock])
  datMethClock <- data.frame(datMethClock0[as.character(datClock$CpGmarker[-1])])
  dim(datMethClock)
  predictedAge <- as.numeric(
    anti.trafo(
      datClock$CoefficientTraining[1] +
        as.matrix(datMethClock) %*% as.numeric(datClock$CoefficientTraining[-1])
    )
  )
  predictedAge <- predictedAge[1]
}


# Annotate every age prediction with a free-text comment.
Comment <- ifelse(
  predictedAge < 0,
  "Negative DNAm age.",
  ifelse(predictedAge > 100, "Old DNAm age.", rep("", length(predictedAge)))
)
Comment[is.na(predictedAge)] <- "Age prediction was not possible. "

# CpG-count problems replace the comment for all samples. The same two
# conditions already trigger stop() in STEP 4, so these act as a fallback.
if (sum(selectCpGsClock) < dim(datClock)[[1]] - 1) {
  Comment <- rep(
    "ERROR: The CpGs listed in column 1 of the input data did not contain the CpGs needed for calculating DNAm age. Make sure to input cg numbers such as cg00075967.",
    length(predictedAge)
  )
}
if (sum(selectCpGsClock) > dim(datClock)[[1]] - 1) {
  Comment <- rep(
    "ERROR: The CpGs listed in column 1 of the input data contain duplicate CpGs. Each row should report only one unique CpG marker (cg number).",
    length(predictedAge)
  )
}

# From here on each check builds a per-sample logical mask (restSamples) and
# appends its message (lab1) to the comments of the flagged samples.

# Values outside [-0.05, 1.05].
restSamples <- minMethBySample < (-0.05) | maxMethBySample > 1.05
restSamples[is.na(restSamples)] <- FALSE
lab1 <- "MAJOR WARNING: Probably you did not input beta values since either minMethBySample<-0.05 or maxMethBySample>1.05."
Comment[restSamples] <- paste(Comment[restSamples], lab1)

# Between 1 and 100 missing values.
restSamples <- noMissingPerSample > 0 & noMissingPerSample <= 100
lab1 <- "WARNING: Some beta values were missing, see noMissingPerSample."
Comment[restSamples] <- paste(Comment[restSamples], lab1)

# More than 3000 missing values.
restSamples <- noMissingPerSample > 3000
lab1 <- "MAJOR WARNING: More than 3k missing values!!"
Comment[restSamples] <- paste(Comment[restSamples], lab1)

# Between 101 and 3000 missing values.
restSamples <- noMissingPerSample > 100 & noMissingPerSample <= 3000
lab1 <- "MAJOR WARNING: noMissingPerSample>100"
Comment[restSamples] <- paste(Comment[restSamples], lab1)

# Mean value above 0.35.
restSamples <- meanMethBySample > .35
restSamples[is.na(restSamples)] <- FALSE
lab1 <- "Warning: meanMethBySample is >0.35"
Comment[restSamples] <- paste(Comment[restSamples], lab1)

# Mean value below 0.25.
restSamples <- meanMethBySample < .25
restSamples[is.na(restSamples)] <- FALSE
lab1 <- "Warning: meanMethBySample is <0.25"
Comment[restSamples] <- paste(Comment[restSamples], lab1)

# Output table: one row per sample column of dat1.
datout <- data.frame(
  SampleID = colnames(dat1)[-1],
  DNAmAge = predictedAge,
  Comment,
  noMissingPerSample,
  meanMethBySample,
  minMethBySample,
  maxMethBySample
)


# Append a predicted gender column when meanXchromosome is supplied and has
# one value per output row: above 0.4 -> "female", below 0.38 -> "male",
# anything in between -> "Unsure".
# NOTE(review): meanXchromosome is defined outside this section; presumably a
# per-sample mean over X-chromosome probes - confirm against the caller.
if (!is.null(meanXchromosome) && length(meanXchromosome) == nrow(datout)) {
  predictedGender <- ifelse(
    meanXchromosome > .4,
    "female",
    ifelse(meanXchromosome < .38, "male", "Unsure")
  )
  datout <- data.frame(
    datout,
    predictedGender = predictedGender,
    meanXchromosome = meanXchromosome
  )
}