-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcosine_50sample.R
158 lines (111 loc) · 4.77 KB
/
cosine_50sample.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# -----------------------------------------------
# Final R Script for Computing and Visualizing Similarity Matrix for First 50 Samples
# -----------------------------------------------
# Load libraries
library(proxy)
library(data.table)
library(Matrix)
library(ggplot2)
library(pheatmap)
library(umap)
# 1. Data Preparation
file_path <- "subset_tables/subset_table.tsv" # note this is the phylum level
# Read the TSV file, skipping the first descriptive title row
# - header = TRUE: the first row after skipping is treated as header (sample names)
# - sep = "\t": tab-separated values
# - skip = 1: skip the first descriptive row
cat("Reading data...\n")
mydata <- fread(file_path, header = TRUE, sep = "\t", skip = 1)
print(dim(mydata)) # Expected: 39 rows x 9512 columns
# Extract OTU IDs and OTU counts
otu_ids <- mydata[[1]] # First column: OTU IDs
otu_counts <- as.matrix(mydata[, -1]) # Remove the first column to get OTU counts
rownames(otu_counts) <- otu_ids # Assign OTU IDs as row names
# Transpose the data to have samples as rows and OTUs as columns
sample_data <- t(otu_counts) # Dimensions: 9511 samples x 39 OTUs
# Select the first 50 samples
subset_size <- 200
# make srue subset_size is less than the number of samples
if (subset_size > nrow(sample_data)) {
stop("Subset size exceeds the number of available samples.")
}
subset_data <- sample_data[1:subset_size, ] # First 50 samples
cat("Selected the first", subset_size, "samples for analysis.\n")
print(dim(subset_data)) # Expected: 50 x 39
# Convert to numeric (if not already)
subset_data <- apply(subset_data, 2, as.numeric)
# Verify the dimensions
print(dim(subset_data)) # Should be 50 x 39
# 2. Computing the Similarity Matrix
# Normalize the data (optional but recommended for similarity measures)
# Z-score normalization
cat("Normalizing subset data...\n")
subset_data_norm <- scale(subset_data)
# Compute Cosine Similarity Matrix
cat("Computing cosine similarity matrix for the subset...\n")
similarity_matrix <- simil(subset_data_norm,
method = "cosine",
diag = TRUE,
upper = TRUE)
# Convert to a dense matrix
similarity_matrix <- as.matrix(similarity_matrix)
# Verify dimensions
print(dim(similarity_matrix)) # Should be 50 x 50
# 3. Visualizing the Similarity Matrix
# a. Heatmap Visualization
cat("Generating heatmap for the similarity matrix...\n")
pheatmap(similarity_matrix,
cluster_rows = TRUE, # Cluster rows to group similar samples
cluster_cols = TRUE, # Cluster columns similarly
show_rownames = TRUE, # Show sample names on rows
show_colnames = TRUE, # Show sample names on columns
color = colorRampPalette(c("blue", "white", "red"))(100),
main = paste("Cosine Similarity Matrix (", subset_size, "Samples)",
sep = ""))
# b. Dimensionality Reduction with UMAP (Optional)
# Convert similarity to distance
distance_matrix_subset <- 1 - similarity_matrix
cat("Applying UMAP on the subset distance matrix...\n")
umap_config <- umap.defaults
umap_config$n_neighbors <- 15
umap_config$min_dist <- 0.1
umap_result <- umap(as.dist(distance_matrix_subset), config = umap_config)
# Plot UMAP results
cat("Plotting UMAP projection...\n")
plot(umap_result$layout,
col = "steelblue",
pch = 19,
main = paste("UMAP Projection of", subset_size, "Samples"),
xlab = "UMAP 1",
ylab = "UMAP 2",
col.main = "black")
# Optionally, add sample labels
text(umap_result$layout, labels = rownames(subset_data), pos = 4, cex = 0.7)
# 4. Saving Results (Optional)
# Save the similarity matrix
saveRDS(similarity_matrix, "cosine_similarity_matrix_subset50.rds")
cat("Cosine similarity matrix for the subset saved as 'cosine_similarity_matrix_subset50.rds'\n")
# Save the heatmap (if not interactive)
# You can save the plot using the png or pdf device
png(filename = "cosine_similarity_heatmap_subset50.png", width = 800, height = 800)
pheatmap(similarity_matrix,
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = TRUE,
show_colnames = TRUE,
color = colorRampPalette(c("blue", "white", "red"))(100),
main = paste("Cosine Similarity Matrix (", subset_size, "Samples)", sep = ""))
dev.off()
cat("Heatmap saved as 'cosine_similarity_heatmap_subset50.png'\n")
# Save the UMAP plot
png(filename = "UMAP_projection_subset50.png", width = 800, height = 600)
plot(umap_result$layout,
col = "steelblue",
pch = 19,
main = paste("UMAP Projection of", subset_size, "Samples"),
xlab = "UMAP 1",
ylab = "UMAP 2",
col.main = "black")
text(umap_result$layout, labels = rownames(subset_data), pos = 4, cex = 0.7)
dev.off()
cat("UMAP projection saved as 'UMAP_projection_subset50.png'\n")