-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdistribution_mean.Rmd
85 lines (71 loc) · 2.5 KB
/
distribution_mean.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
---
title: Test File
---
```{r}
# Load necessary libraries
library(readr)
library(ggplot2)
```
```{r}
# Taxonomic rank to analyse, given as an index into `file_name_vector`:
# 1 = phylum, 2 = class, 3 = order, 4 = family, 5 = genus, 6 = species.
taxonomy <- 4
# One input table per taxonomic rank, in the index order listed above.
file_name_vector <- c(
  "./subset_tables/subset_table_phylum.tsv",
  "./subset_tables/subset_table_class.tsv",
  "./subset_tables/subset_table_order.tsv",
  "./subset_tables/subset_table_family.tsv",
  "./subset_tables/subset_table_genus.tsv",
  "./subset_tables/subset_table_species.tsv"
)
# Fail fast on an invalid index: `[` would otherwise silently return NA and
# the mistake would only surface later, when the file is read.
if (!(length(taxonomy) == 1L && taxonomy %in% seq_along(file_name_vector))) {
  stop(
    "`taxonomy` must be a single index between 1 and ",
    length(file_name_vector), ".",
    call. = FALSE
  )
}
# Path of the table selected by `taxonomy`.
file_name <- file_name_vector[taxonomy]
```
```{r}
# Read the TSV file. Its first line is a description rather than data, so it
# is skipped and the following line supplies the column names.
mydata <- read_tsv(file_name, skip = 1)
# Coerce the value columns to a matrix once. The first column is assumed to
# hold the row labels and is excluded. `apply()` on a data frame performs
# this coercion implicitly on every call, so doing it explicitly avoids
# repeating the work.
abundance_matrix <- as.matrix(mydata[, -1])
# Mean of each row, using the vectorised `rowMeans()`. It stops with an error
# if the remaining columns are not all numeric.
row_means <- rowMeans(abundance_matrix)
# Sample variance of each row.
row_variances <- apply(abundance_matrix, 1, var)
```
```{r}
# Row labels are taken from the first column of the input table (assumed to
# contain the bacteria names).
bacteria_names <- mydata[[1L]]
# Collect label, mean and variance of every row in one data frame.
mean_variance_data <- data.frame(
  Bacteria = bacteria_names,
  Mean = row_means,
  Variance = row_variances
)
```
```{r}
# Smallest mean a row must reach to be kept
cutoff <- 0.001
# Row count before filtering
total_rows <- nrow(mean_variance_data)
# Keep the rows whose Mean is at least the cutoff; `which()` drops rows where
# the comparison evaluates to NA
rows_kept <- which(mean_variance_data$Mean >= cutoff)
mean_variance_data_filtered <- mean_variance_data[rows_kept, ]
# Row count after filtering
filtered_rows <- nrow(mean_variance_data_filtered)
# Report how many rows the filter kept and removed
cat("Total rows before filtering:", total_rows, "\n")
cat("Total rows after filtering:", filtered_rows, "\n")
cat("Number of rows removed:", total_rows - filtered_rows, "\n")
```
```{r}
# Histogram of the Mean column of the filtered data, using bins 0.005 wide
ggplot(data = mean_variance_data_filtered, mapping = aes(x = Mean)) +
  geom_histogram(binwidth = 0.005, fill = "blue", alpha = 0.7) +
  labs(title = "Distribution of Means", x = "Mean", y = "Frequency") +
  theme_minimal()
```
```{r}
# Scatter plot of Variance against Mean with a log10 x-axis. The plot is
# stored in `p`; this chunk does not draw it.
p <- ggplot(
  data = mean_variance_data_filtered,
  mapping = aes(x = Mean, y = Variance)
) +
  scale_x_log10() +
  labs(title = "Variance vs. Mean", x = "Mean (log scale)", y = "Variance") +
  theme_minimal() +
  geom_point(alpha = 0.6, color = "darkgreen") +
  # Linear-model trend line without a confidence band, drawn over the points
  geom_smooth(method = "lm", se = FALSE, color = "red")
```
```{r}
# Save the plot stored in `p` as a 15 x 8 inch PDF; the file name is relative
# to the current working directory
ggsave(
  filename = "variance_mean.pdf",
  plot = p,
  width = 15,
  height = 8,
  units = "in"
)
```
Copy the datasets to the SCC:

```bash
scp -r /home/hongrui_liu/CAMH/American_Gut_R/subset_tables [email protected]:/nethome/kcni/hliu/projects/hliurepo
```