Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

User defined columns in phenodata #11

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion tools/microarray/R/average-replicates.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@
# rewritten by IS, 6.9.2010
# modified by IS, 16.2.2011
# modified by IS,12.10.2012
# modified by OH,09.09.2021, additional parameters for phenodata read.table

# load inputs
file <- 'normalized.tsv'
dat <- read.table(file, header=TRUE, sep='\t', quote='', row.names=1, check.names=FALSE)
phenodata <- read.table('phenodata.tsv', header=TRUE, sep='\t', as.is=TRUE)
phenodata <- read.table('phenodata.tsv', header=TRUE, sep='\t', quote='', as.is=TRUE, check.names=FALSE, comment.char='')

# identify replicates
replicates <- unique(phenodata[duplicated(phenodata[,column]), column])
Expand Down
56 changes: 30 additions & 26 deletions tools/microarray/R/calculate-fold-change.R
Original file line number Diff line number Diff line change
@@ -1,28 +1,32 @@
# TOOL calculate-fold-change.R: "Calculate fold change" (Calculates a geometric average of gene expression for replicate chips and then
# calculates a difference or ratio between the averages. The output fold change will be represented in log2 scale.
# TOOL calculate-fold-change.R: "Calculate fold change" (Calculates a geometric or arithmetic average of gene expression for replicate chips and then
# calculates a difference or ratio between the averages. The output fold change can be represented either in log2 or linear scale.
# Note that the tool is applicable only if you have defined two groups of samples, and the reference group is determined by the smaller group number.)
# INPUT normalized.tsv: normalized.tsv TYPE GENE_EXPRS
# INPUT META phenodata.tsv: phenodata.tsv TYPE GENERIC
# OUTPUT fold-change.tsv: fold-change.tsv
# PARAMETER column: Column TYPE METACOLUMN_SEL DEFAULT group (Phenodata column describing the groups. Samples of which group attribute is NA are excluded from the analysis.)
# PARAMETER geometric: "Which mean to use" TYPE [geometric: geometric, arithmetic: arithmetic] DEFAULT geometric (Should the geometric or arithmetic mean be used in the calculation of average expression for the sample groups?)
# PARAMETER outscale: Scale TYPE [log2: log2, linear: linear] DEFAULT log2 (What scale to use for expressing the results. Log2 yields a symmetric distribution around zero with no change being equal to 0, up-regulation taking positive values and down-regulation negative values. Conversely, in linear scale up-regulation is represented by values greater than 1 and down-regulation values being between 0 and 1.)

# JTT 30.7.2007, Calculate fold changes between groups of samples
# MG, 3.5.2011, added parameters for choosing aritmetic or geometric mean and for choosing linear or log scale
# OH, 30.11.2011, support for names as group categories
# MG, 30.11.2011, added parameter do define reference group
# ML, 18.10.2016
# ES, 27.06.2021 set mean to geometric and scale to log2
# PARAMETER OPTIONAL geometric: "Which mean to use" TYPE [yes:geometric, no:arithmetic] DEFAULT yes (Should the geometric or arithmetic mean be used in the calculation of average expression for the sample groups?)
# PARAMETER OPTIONAL scale: Scale TYPE [log2, linear] DEFAULT log2 (What scale to use for expressing the results. Log2 yields a symmetric distribution around zero with no change being equal to 0, up-regulation taking positive values and down-regulation negative values. Conversely, in linear scale
# up-regulation is represented by values greater than 1 and down-regulation values being between 0 and 1.)
# OH, 09.09.2021, additional parameters for phenodata read.table
# OH, 09.09.2021, option geometric or arithmetic mean and option output log2 or linear

if( ! exists("geometric", mode="character") ) {
geometric <- "geometric"
}
if( ! exists("outscale", mode="character") ) {
outscale <- "log2"
}


geometric<- "yes"
scale<- "log2"
# Loads the normalized data and phenodata
file<-c("normalized.tsv")
dat<-read.table(file, header=T, sep="\t", row.names=1)
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")
phenodata<-read.table("phenodata.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')

#remove rows that are NAs
na_samples <- phenodata[which(is.na(phenodata[,pmatch(column,colnames(phenodata))])), 1]
Expand All @@ -46,10 +50,10 @@ if(length(unique(groups))>2) {
stop("CHIPSTER-NOTE: You have more than two groups! I don't know how to calculate fold change.")
}

# If arithmetic mean, then transform values to linear scale, arithmetic option removed
#if (geometric == "no") {
#"dat2 <- as.data.frame (2^dat2)
#}
# If arithmetic mean, then transform values to linear scale
if (geometric == "arithmetic") {
dat2 <- as.data.frame (2^dat2)
}

# Calculating averages
columnnames<-c()
Expand All @@ -70,21 +74,21 @@ rownames(dat3)<-rownames(dat2)

# Calculating the fold change
# Treatment divided by the control
if (geometric == "yes") {
if (geometric == "geometric") {
FC <- dat3[,2]-dat3[,1]
} #else {
#FC <- dat3[,2] / dat3 [,}

} else {
FC <- dat3[,2] / dat3 [,1]
}

# If arithmetic mean and log2 scale, then transform values back to log2 scale, arithmetic mean removed
#if (geometric == "no" && scale == "log2") {
# FC <- log2(FC)
#}
# If arithmetic mean and log2 scale, then transform values back to log2 scale
if (geometric == "arithmetic" && outscale == "log2") {
FC <- log2(FC)
}

# If geometric mean and linear scale, then transform values to linear scale, linear scale removed
#if (geometric == "yes" && scale == "linear") {
# FC <- 2^FC
#}
# If geometric mean and linear scale, then transform values to linear scale
if (geometric == "geometric" && outscale == "linear") {
FC <- 2^FC
}

# Saving the results
write.table(data.frame(dat, FC=FC), file="fold-change.tsv", sep="\t", row.names=T, col.names=T, quote=F)
Expand Down
4 changes: 3 additions & 1 deletion tools/microarray/R/classification-knn.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

# JTT 26.6.2006: KNN classification created
# MK 02.07.2013: Bugs fixed, added group and traning column selection to the header section
# modified by OH,09.09.2021, additional parameters for phenodata read.table


# Parameter settings (default) for testing purposes
#number.of.nearest.neighbors<-2
Expand All @@ -29,7 +31,7 @@ file<-c("normalized.tsv")
dat<-read.table(file, sep="\t", header=T, row.names=1)

# Reads the phenodata table
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")
phenodata<-read.table("phenodata.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')

# Checks whether the training variable is present if phenodata
# If training part of phenodata has not been filled, but the column (header) is present,
Expand Down
3 changes: 2 additions & 1 deletion tools/microarray/R/classification.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# JTT 22.01.2009
# MK 18.06.2013 Chip-prediction table added to the output
# Prediction has not yet been implemented!
# modified by OH,09.09.2021, additional parameters for phenodata read.table

# Parameter settings (default) for testing purposes
#method<-"knn"
Expand Down Expand Up @@ -43,7 +44,7 @@ if(nrow(dat) > max.genes) {
}

# Reads the phenodata table
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")
phenodata<-read.table("phenodata.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')

# Separates expression values and flags
calls<-dat[,grep("flag", names(dat))]
Expand Down
7 changes: 4 additions & 3 deletions tools/microarray/R/correlation-analysis-mirna.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# IS, 8.6.2010, bugfix
# IS, 28.7.2010, now allows additional samples in the two data sets, but only those ones with a pair are used in the analysis
# MG, 10.2.2011, now gets the gene symbols from the org.HS.eg.db package and adds rownames the output tables to allow use of Venn diagram
# modified by OH,09.09.2021, additional parameters for phenodata read.table

# Loads the libraries
library(RmiR)
Expand All @@ -25,8 +26,8 @@ library(org.Hs.eg.db)
# Loads the normalized data and phenodata files
data_1 <- read.table(file="normalized_mirna.tsv", header=T, sep="\t", row.names=1)
data_2 <- read.table(file="normalized_gene.tsv", header=T, sep="\t", row.names=1)
phenodata_1 <- read.table("phenodata_mirna.tsv", header=T, sep="\t")
phenodata_2 <- read.table("phenodata_gene.tsv", header=T, sep="\t")
phenodata_1 <- read.table("phenodata_mirna.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')
phenodata_2 <- read.table("phenodata_gene.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')

# Figure out which is the miRNA data
if (phenodata_1$chiptype[1] == "miRNA") {
Expand Down Expand Up @@ -150,4 +151,4 @@ results.negative.significant <- results.negative[results.negative[,5]<=p.value.t
write.table(results.positive.significant, file="mirna-gene-positive-correlation.tsv", sep="\t", quote=FALSE, row.names=TRUE)
write.table(results.negative.significant, file="mirna-gene-negative-correlation.tsv", sep="\t", quote=FALSE, row.names=TRUE)

# EOF
# EOF
3 changes: 2 additions & 1 deletion tools/microarray/R/delete-and-subtract-columns.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

# Deletes columns from data and subtacts their (average values) from other columns which they are associated with
# MK 13.08.2013
# modified by OH,09.09.2021, additional parameters for phenodata read.table

# Sanity checks
if(column1=="EMPTY" || column2=="EMPTY") {
Expand All @@ -17,7 +18,7 @@ if(column1=="EMPTY" || column2=="EMPTY") {
# Loads the data
file<-c("normalized.tsv")
dat<-read.table(file, sep="\t", header=T, row.names=1)
phenodata <- read.table('phenodata.tsv', header=TRUE, sep='\t', as.is=TRUE)
phenodata <- read.table('phenodata.tsv', header=TRUE, sep='\t', quote='', as.is=TRUE, check.names=FALSE, comment.char='')

# identify matrices (chip, flag, segmented, ...) present in the data
datnames <- colnames(dat)
Expand Down
6 changes: 4 additions & 2 deletions tools/microarray/R/merge-datasets.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
# Ilari Scheinin <[email protected]>
# 2012-10-12

# modified by OH,09.09.2021, additional parameters for phenodata read.table

# read data set 1
file1 <- 'normalized_1.tsv'
dat <- read.table(file1, header=TRUE, sep='\t', quote='', row.names=1, check.names=FALSE)
Expand Down Expand Up @@ -60,8 +62,8 @@ if (ncol(ratios) == ncol(calls))
dat <- cbind(dat, calls)

# process phenodata
phenodata1 <- read.table('phenodata_1.tsv', header=TRUE, sep='\t', as.is=TRUE)
phenodata2 <- read.table('phenodata_2.tsv', header=TRUE, sep='\t', as.is=TRUE)
phenodata1 <- read.table('phenodata_1.tsv', header=TRUE, sep='\t', quote='', as.is=TRUE, check.names=FALSE, comment.char='')
phenodata2 <- read.table('phenodata_2.tsv', header=TRUE, sep='\t', quote='', as.is=TRUE, check.names=FALSE, comment.char='')

# fill in columns present only in one phenodata table
for (col in setdiff(colnames(phenodata1), colnames(phenodata2)))
Expand Down
3 changes: 2 additions & 1 deletion tools/microarray/R/norm-combat.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# ComBat batch correction analysis
# MK 25.06.2013
# ML 24.08.2016 Fixed (batch as.numeric)
# modified by OH,09.09.2021, additional parameters for phenodata read.table

# Loading libraries
library(sva)
Expand All @@ -19,7 +20,7 @@ dat <- read.table(file, header=T, sep="\t", row.names=1)

# Loads Batch information
file <- "phenodata.tsv";
phenodata <- read.table(file, header=T, sep="\t")
phenodata <- read.table(file, header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')

# Merge batches to matrix
if(batch1 == "EMPTY" & batch2 == "EMPTY" & batch3 == "EMPTY") { stop("CHIPSTER-NOTE: no batches defined"); }
Expand Down
4 changes: 3 additions & 1 deletion tools/microarray/R/norm-lme.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
# JTT: 12.7.2006 Crated linear Mixed Model
# JTT: 19.10.2006: Modified to use nlme library
# MK: 22.05.2013: Noise factor parameter added
# modified by OH,09.09.2021, additional parameters for phenodata read.table


#column.groups<-"group"
#column.random<-"age"
Expand All @@ -26,7 +28,7 @@ calls<-dat[,grep("flag", names(dat))]
dat2<-dat[,grep("chip", names(dat))]

# Loads phenodata
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")
phenodata<-read.table("phenodata.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')

# Test needs a parameter "groups" that specifies the grouping of the samples
# and a parameter random that specifies the random effect such as day or technician
Expand Down
3 changes: 2 additions & 1 deletion tools/microarray/R/norm-specific-samples.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

# Normalize the data to specific samples
# JTT 5.12.2008
# modified by OH,09.09.2021, additional parameters for phenodata read.table

# Loads the data file
file<-c("normalized.tsv")
Expand All @@ -16,7 +17,7 @@ dat<-read.table(file, header=T, sep="\t", row.names=1)
dat2<-dat[,grep("chip", names(dat))]

# Loads phenodata
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")
phenodata<-read.table("phenodata.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')

# Extract the data from the phenodata column
extract<-phenodata[,which(column.to.normalize.by==colnames(phenodata))]
Expand Down
3 changes: 2 additions & 1 deletion tools/microarray/R/plot-annoheatmap.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

# MK 17.10.2013 Annotated Heatmap
# ML 29.07.2019 Add option to use gene symbols as row names
# modified by OH,09.09.2021, additional parameters for phenodata read.table

library("Heatplus")

Expand All @@ -40,7 +41,7 @@ if (ncol(dat2) > 20000) {
}

# Read phenodata and generate annotation features
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")
phenodata<-read.table("phenodata.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')
colnames(dat2)<-gsub(" ", "", phenodata$description)
if(column != "EMPTY") {
groups<-phenodata[,pmatch(column,colnames(phenodata))]
Expand Down
3 changes: 2 additions & 1 deletion tools/microarray/R/plot-boxplot.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# JTT 02.10.2007: Boxplot
# MG 15.09.2011: Updated colors and legend
# MK 10.10.2013: User can choose which phenodata column is used for coloring bars
# modified by OH,09.09.2021, additional parameters for phenodata read.table

# Renaming variables
w<-image.width
Expand All @@ -23,7 +24,7 @@ calls<-dat[,grep("flag", names(dat))]
dat2<-dat[,grep("chip", names(dat))]

# Loads phenodata
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")
phenodata<-read.table("phenodata.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')
if(length(grep(column, colnames(phenodata))) == 0) {
stop("CHIPSTER-NOTE: You need to select phenodata column to run this analysis")
}
Expand Down
3 changes: 2 additions & 1 deletion tools/microarray/R/plot-dendrogram.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

# JTT 03.10.2007: Dendrogram
# MG 25.11.2010: Increased the gene/sample limit to 20000
# modified by OH,09.09.2021, additional parameters for phenodata read.table

# Renaming variables
w<-image.width
Expand All @@ -28,7 +29,7 @@ file<-c("normalized.tsv")
dat<-read.table(file, header=T, sep="\t", row.names=1)

# Loads the phenodata
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")
phenodata<-read.table("phenodata.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')
groups<-phenodata[,pmatch(column,colnames(phenodata))]

# Separates expression values and flags
Expand Down
4 changes: 3 additions & 1 deletion tools/microarray/R/sample-size-with-bh.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
# Ilari Scheinin <[email protected]>
# 2012-10-12

# modified by OH,09.09.2021, additional parameters for phenodata read.table

file <- 'normalized.tsv'
dat <- read.table(file, header=TRUE, sep='\t', quote='', row.names=1, check.names=FALSE)
phenodata <- read.table("phenodata.tsv", header=TRUE, sep="\t")
phenodata <- read.table("phenodata.tsv", header=TRUE, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')
groups <- phenodata[,column]

if(length(unique(groups[!is.na(groups)]))!=2)
Expand Down
3 changes: 2 additions & 1 deletion tools/microarray/R/sort-samples.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@

# JTT, 06.02.2008, Sort samples
# MG, 16.11.2010, modified to also generate a re-ordered phenodata file to reflect the re-ordered data
# modified by OH,09.09.2021, additional parameters for phenodata read.table

# Default parameters
#column<-"group"

# Loads libraries
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")
phenodata<-read.table("phenodata.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')

# Loads data (which file to search)
file<-c("normalized.tsv")
Expand Down
3 changes: 2 additions & 1 deletion tools/microarray/R/stat-chisq-snp.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

# JTT: 25.4.2008: Association analysis with normalized SNP data
# MG, 14.10.2009: modified
# modified by OH,09.09.2021, additional parameters for phenodata read.table

# Read in data
dat<-read.table("normalized.tsv", header=T, sep="\t", row.names=1, comment.char = "")
Expand All @@ -16,7 +17,7 @@ calls<-dat[,grep("flag", names(dat))]
dat2<-dat[,grep("chip", names(dat))]

# Test needs a parameter "groups" that specifies the grouping of the samples
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")
phenodata<-read.table("phenodata.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')
groups<-phenodata[,pmatch(column,colnames(phenodata))]

# Select only samples in group 1 (controls)
Expand Down
3 changes: 2 additions & 1 deletion tools/microarray/R/stat-correlate-with-phenodata.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

# Find genes that correlate with phenodata
# JTT 19.7.2007
# modified by OH,09.09.2021, additional parameters for phenodata read.table

# Parameter settings (default) for testing purposes
#correlation.cutoff<-c(0.95)
Expand All @@ -28,7 +29,7 @@ calls<-dat[,grep("flag", names(dat))]
dat2<-dat[,grep("chip", names(dat))]

# Test needs a parameter "groups" that specifies the grouping of the samples
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")
phenodata<-read.table("phenodata.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')
groups<-phenodata[,pmatch(column,colnames(phenodata))]

# Sanity checks
Expand Down
3 changes: 2 additions & 1 deletion tools/microarray/R/stat-geneset.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,15 @@
# MG 03.05.2010: to exclude single gene genesets and added group labels in plots. Added more info columns in results table
# MK 03.09.2013: updated to R.3.0 which does not support globaltest / geneplot functions
# EK 05.11.2014: clarified the text.
# modified by OH,09.09.2021, additional parameters for phenodata read.table

#column<-"group"
#pathway.or.genelist <-"KEGG"
#use.multiple.testing.correction <-"yes"
#number.of.groups.to.visualize <-16

# Loading the libraries
phenodata<-read.table("phenodata.tsv", header=T, sep="\t")
phenodata<-read.table("phenodata.tsv", header=T, sep="\t", quote='', as.is=TRUE, check.names=FALSE, comment.char='')
if(phenodata$chiptype[1]!="cDNA" | phenodata$chiptype[1]!="Illumina") {
# Saves the chiptype into object lib
lib<-phenodata$chiptype[1]
Expand Down
Loading