-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdada2venn_v2.jl
146 lines (127 loc) · 5.09 KB
/
dada2venn_v2.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
using DataFrames, CSV
using RCall
#using StatsBase
##read dada2 output files, sample table
function input2df(asv_file, taxa_file, sample_file)
asv = CSV.File(asv_file) |> DataFrame;
taxa = CSV.File(taxa_file) |> DataFrame;
taxa = permutedims(taxa, 1)
sample = CSV.File(sample_file) |> DataFrame;
sample.Treatment = strip.(sample.Treatment)
return asv, taxa, sample
end
##merge these table above to be a metadata table
function df_merger(asv, taxa, sample)
asv_taxa = vcat(asv, taxa)
new_asv_id = ["ASV-$(i-1)" for i in 2:ncol(asv_taxa)]
rename!(asv_taxa, names(asv_taxa)[2:end] .=> new_asv_id)
leftjoin!(asv_taxa, sample, on = "Column1" => "Sample Name")
#unique(asv_taxa.Treatment)
#remove taxa rows or with "missing" in the last few columns
df_taxa = filter(:Treatment => ismissing, asv_taxa)
#extract the rows with a treatment targeted.
filter!(:Treatment => !ismissing, asv_taxa)
#rename!(asv_taxa, Dict(:Column1 => "Sample"))
return asv_taxa, df_taxa
end
##filter ASV with few read support and existing in few samples
function asv_filter(asv, treatment)
filter!(:Treatment => x -> x ∈ treatment, asv)
#CSV.write("asv.csv", asv)
group_label = length(treatment) > 1 ? "Treatment" : "Column1"
samples_cutoff = length(treatment) > 1 ? 2 : 1
println("groupby: $group_label")
##split-apply-combine: filter ASVs within a treatment
gdfs = groupby(asv, "$(group_label)")
group_ordered = []; gdfs_filter = []
for gdf in gdfs
push!(group_ordered, unique(gdf[:, "$(group_label)"]))
#println("before\t", size(gdf))
select_c = [1,]
for n in 2:ncol(gdf)-3
c = (gdf[:, n] .> 5 ) |> count # more than 5 reads per sample, two samples at least per treatment
(c >= samples_cutoff) && push!(select_c, n)
end
gdf = select(gdf, select_c)
#println("after\t", size(gdf))
push!(gdfs_filter, gdf)
end
return gdfs_filter, group_ordered
end
##extract good ASV based on any taxonomy classifcation and term
function asv_taxa_extract(asv_in, taxa_in, sample_in, treatment)
(asv_taxa, df_taxa) = df_merger(asv_in, taxa_in, sample_in)
#vscodedisplay(df_taxa)
#CSV.write("taxa.csv", df_taxa)
(asv_filter_dfs, groups) = asv_filter(asv_taxa, treatment)
println("#Groups:\n", groups)
println("#Number\tASVs")
#vscodedisplay(asv_filter_dfs[1])
#CSV.write("asv_filter.csv", asv_filter_dfs[1])
gdfs_filter_term = [vcat(gdf, df_taxa, cols=:intersect) for gdf in asv_filter_dfs]
#vscodedisplay(gdfs_filter_term[1])
#CSV.write("gdfs_filter_term.csv", gdfs_filter_term[1])
## to get the index of one rank
ranks = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
rank_i = findfirst(rank .== ranks) #faster and efficient
##extract ASVs assinged to a taxonomy term.
asv_list = Vector{String}[]; #asv_list_c = Vector{String}[]
for gdf in gdfs_filter_term
asv_detected = String[]; #asv_detected_c = String[]
taxa_detected = collect(gdf[end-7+rank_i, :])
# for n in 2:length(taxa_detected)
# if term == taxa_detected[n]
# push!(asv_detected, names(gdf)[n])
# #[append!(asv_detected_c, repeat([names(gdf)[n],], reads)) for reads in gdf[1:end-7, n]]
# end
# end
asv_detected = ifelse.(taxa_detected[2:end] .== term, names(gdf)[2:end], missing)
asv_detected = filter(!ismissing, asv_detected)
println(length(asv_detected), "\t", asv_detected)
push!(asv_list, asv_detected)
#push!(asv_list_c, asv_detected_c)
end
#asv_list
return asv_list, groups
end
function venn_plot(asv, category, category_label_size, output_file)
##plot veen diagram using R
R"""
library(ggVennDiagram); library(ggplot2)
#png("mtcars.png")
ggVennDiagram($asv,
label_alpha = 0,
label_percent_digit = 1,
category.names = $category,
edge_size = 0.5,
set_size = $category_label_size,
) +
ggplot2::scale_fill_gradient(low="white",high = "red") +
scale_x_continuous(expand = expansion(mult = c(.11)))
ggsave($output_file)
#dev.off()
"""
end
## to check ASV at one taxonomy level and with a term
rank = "Kingdom"
term = "Bacteria"
## input files
asv_in_file = "Immuno_8/dada2_asv_Immuno_8.csv"
taxa_in_file = "Immuno_8/dada2_taxa_names_Immuno_8.csv"
sample_in_file = "Immuno_8/samples2_Immuno8_v3.csv"
## run functions
(asv_in, taxa_in, sample_in) = input2df(asv_in_file, taxa_in_file, sample_in_file)
#treatment = ["FM control", "BSF 10", "BSF 30"]
#treatment = ["FM control + Immuno", "BSF 10 + Immuno", "BSF 30 + Immuno"]
#treatment = ["FM control"]
#treatment = ["BSF 10"]
#treatment = ["BSF 30"]
#treatment = ["FM control + Immuno"]
#treatment = ["BSF 10 + Immuno"]
treatment = ["BSF 30 + Immuno"]
(asv_p, category_p) = asv_taxa_extract(asv_in, taxa_in, sample_in, treatment);
if length(asv_p) >= 2
venn_plot(asv_p, category_p, 3, "BSF30_Immuno-samples-only.pdf")
else
println("Cannot find overlapping ASV between any two groups!")
end