-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathload_and_clean.R
103 lines (98 loc) · 5.74 KB
/
load_and_clean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#load libraries (find a more elegant way to check for libraries)
library(ggplot2)
library(gplots)
library(vegan)
library(ape)
library(gridExtra)
#### LOADING DATA ####
load_biom_file <- function(filename){
print("Need to implement <load_biom_file()>")
biom_file <- 1
return(biom_file)
}
# load sequence file, keeping sequences named in the novel_otus list
# deprecated for now, but add check for sequences that are not in the fasta file and throw an error
load_sequence_subset <- function(seq_filename, subset_names = "ALL"){
sequences <- readDNAStringSet(filepath = seq_filename)
print(length(subset_names))
if(toupper(subset_names) == "ALL"){
seq_subset <- sequences
} else{
#seq_subset <- sequences[sapply(subset_names, function(x) grep(x, names(sequences)))]
}
return(seq_subset)
}
# give the sum of the entire table or distributed across metadata categories
# takes the OTU table directly instead of the phyloseq object (e.g., otu_table(phyloseq_object))
sum_table_by_metadata <- function(otu_table_matrix, metadata_cat = "all"){
if(metadata_cat == "all"){
full_sum <- sum(otu_table_matrix)
return_table <- data.frame("all", full_sum, stringsAsFactors = FALSE)
colnames(return_table) <- c("METADATA", "TABLE_SUM")
} else {
list_of_metadata_values <- unique(metadata_mapping[,metadata_cat])
metadata_values <- character(length(list_of_metadata_values))
metadata_sums <- numeric(length(list_of_metadata_values))
for(i in 1:length(list_of_metadata_values)){
temp_list <- NULL
temp_list <- metadata_mapping[metadata_mapping[,metadata_cat] == list_of_metadata_values[i], 1]
temp_sums <- colSums(otu_table_matrix[,as.character(temp_list)])
metadata_values[i] <- as.character(list_of_metadata_values[i])
metadata_sums[i] <- sum(temp_sums)
}
return_table <- data.frame(metadata_values, metadata_sums, stringsAsFactors = FALSE)
#TODO (mdjlynch): this data frame has levels that interfere with downstream calculations (may be solved)
#return_table <- data.frame(metadata_values, metadata_sums)
colnames(return_table) <- c("METADATA", "TABLE_SUM")
}
return(return_table)
}
#############
# OTU TABLE #
#############
# #stacked plot from previous analysis (as guide):
# #otu_plot <- ggplot(otu.table,aes(Encoding,Count,fill=Phylum),alpha=Genus) + geom_bar(position="fill",stat="identity",colour="white") + coord_flip()
#
# # Abundance Threshold Taxonomic Cleveland Plot
# # -----|----- Order1 *************
# # Order2 *********
# # Metadata Category Order3 ****
# # 1:x 2: 3: 4: Order4 **
# #
# #
# # IMPLEMENT AS A SHINY CHART OR SHINYAPP EMBEDDED IN A DOCUMENT
# # Create an OTU table with taxonomy split up over the last seven columns, K | P | C | O | F | G | S
# # Note, this section and the next can be better optimized (since both are using similar data processing)
# # Note, keep otu table as data frame, converting to matrix only when doing the ordination
# # process OTU table, splitting taxonomy (this is ugly and hackish, but works right now - fix later)
# # from: http://stackoverflow.com/questions/7069076/split-column-at-delimiter-in-data-frame
# otu_table_taxsplit <- within(otu_table, Consensus.Lineage<-data.frame(do.call('rbind', strsplit(as.character(Consensus.Lineage), ';', fixed=TRUE))))
# otu_table_taxsplit <- cbind(otu_table_taxsplit, otu_table_taxsplit$Consensus.Lineage$X2, otu_table_taxsplit$Consensus.Lineage$X3,
# otu_table_taxsplit$Consensus.Lineage$X4, otu_table_taxsplit$Consensus.Lineage$X5,
# otu_table_taxsplit$Consensus.Lineage$X6, otu_table_taxsplit$Consensus.Lineage$X7)
#
# # rename columns from Consensus.Lineage.X1-X7 to KINGDOM, PHYLUM, CLASS, ORDER, FAMILY, GENUS
# # Exclude SPECIES for now
# # TODO: All of this hacking and splicing might be better dealt with when reading in the file (change
# # ';' to tabs, write to temp file, then read.table). Check: http://www.r-bloggers.com/using-r-reading-tables-that-need-a-little-cleaning/
# # Works as is for now
# otu_table$Consensus.Lineage.X1 <- NULL # remove the 'Root' column
# names(otu_table_taxsplit)[names(otu_table_taxsplit)=="otu_table_taxsplit$Consensus.Lineage$X2"] <- "KINGDOM"
# names(otu_table_taxsplit)[names(otu_table_taxsplit)=="otu_table_taxsplit$Consensus.Lineage$X3"] <- "PHYLUM"
# names(otu_table_taxsplit)[names(otu_table_taxsplit)=="otu_table_taxsplit$Consensus.Lineage$X4"] <- "CLASS"
# names(otu_table_taxsplit)[names(otu_table_taxsplit)=="otu_table_taxsplit$Consensus.Lineage$X5"] <- "ORDER"
# names(otu_table_taxsplit)[names(otu_table_taxsplit)=="otu_table_taxsplit$Consensus.Lineage$X6"] <- "FAMILY"
# names(otu_table_taxsplit)[names(otu_table_taxsplit)=="otu_table_taxsplit$Consensus.Lineage$X7"] <- "GENUS"
# #names(otu_table_taxsplit)[names(otu_table_taxsplit)=="otu_table_taxsplit$Consensus.Lineage$X8"] <- "SPECIES"
#
# # if cell doesn't start with k__ (or whatever), replace with Unclassified
# otu_table_taxsplit$KINGDOM <- gsub("^((?!k__).)*$", "Unclassified", otu_table_taxsplit$KINGDOM, perl=TRUE)
# otu_table_taxsplit$PHYLUM <- gsub("^((?!p__).)*$", "Unclassified", otu_table_taxsplit$PHYLUM, perl=TRUE)
# otu_table_taxsplit$CLASS <- gsub("^((?!c__).)*$", "Unclassified", otu_table_taxsplit$CLASS, perl=TRUE)
# otu_table_taxsplit$ORDER <- gsub("^((?!o__).)*$", "Unclassified", otu_table_taxsplit$ORDER, perl=TRUE)
# otu_table_taxsplit$FAMILY <- gsub("^((?!f__).)*$", "Unclassified", otu_table_taxsplit$FAMILY, perl=TRUE)
# otu_table_taxsplit$GENUS <- gsub("^((?!g__).)*$", "Unclassified", otu_table_taxsplit$GENUS, perl=TRUE)
# #otu_table_taxsplit$SPECIES <- gsub("^((?!s__).)*$", "Unclassified", otu_table_taxsplit$SPECIES, perl=TRUE)
#
# #delete Consensus.Lineage
# otu_table_taxsplit$Consensus.Lineage <- NULL