forked from zoey-rw/soil_microbe_GEMs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge commit '2e071bdcdae8d5e6028b4e072b0236852a669592'
- Loading branch information
Showing
19 changed files
with
117,730 additions
and
18,572 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
library(tidyverse) | ||
library(broom) | ||
|
||
setwd("/projectnb/frpmars/soil_microbe_db/") | ||
source("/projectnb/frpmars/soil_microbe_db/scripts/helper_functions.r") | ||
|
||
set.seed(1) | ||
# Read in microbe abundances, soil metadata, and GEM information | ||
species_abundances <- fread("./soil_microbe_db_abundances.csv") | ||
# length(unique(species_abundances$taxonomy_id)) # 26204 unique taxa | ||
#%>% | ||
#select(-`...1`) %>% | ||
# Remove some taxon categories that we probably don't want (no viruses) | ||
# filter(!taxon %in% c("Classified at a higher level","Unclassified")) %>% | ||
# filter(!grepl("virus", taxon)) | ||
|
||
soil_metadata <- fread("../ref_data/environmental_metadata_NEON.csv") %>% | ||
dplyr::select(-V1) %>% | ||
mutate(sample_id = paste0(genomicsSampleID, "_soil_microbe_db_filtered")) %>% | ||
filter(sample_id %in% species_abundances$sample_id) | ||
|
||
# Combine into one dataframe | ||
sample_abundance = merge(species_abundances %>% filter(!is.na(source)), | ||
soil_metadata, by.x = "sample_id", by.y = "sample_id") | ||
sample_abundance$db_name = "soil_microbe_db" | ||
sample_abundance$taxon=sample_abundance$name | ||
|
||
|
||
organism_info = sample_abundance %>% distinct(source, taxonomy_id, is_MAG, taxid_lineage, lineage, name, taxon) | ||
write_csv(organism_info, "./intermediate_data/organism_info.csv") | ||
|
||
|
||
# Filter species by prevalence in at least 50 samples | ||
n_sample_prevalence = sample_abundance %>% group_by(name) %>% tally(name = "n_samples") | ||
sample_abundance <- left_join(sample_abundance, n_sample_prevalence) | ||
sample_abundance <- sample_abundance %>% filter(n_samples > 50) | ||
length(unique(sample_abundance$taxonomy_id)) # 18589 unique taxa | ||
write_csv(sample_abundance, "./sample_abundance_data.csv") | ||
|
||
|
||
sample_abundance_lean = sample_abundance %>% dplyr::select(taxonomy_id, percentage, sample_id) | ||
write_csv(sample_abundance_lean, "./intermediate_data/sample_abundance_data_lean.csv") | ||
|
||
env_data_lean = sample_abundance %>% dplyr::select(sample_id) | ||
write_csv(sample_abundance_lean, "./intermediate_data/sample_abundance_data_lean.csv") | ||
|
||
|
||
# Establish | ||
merged_df_nest = merged_df %>% | ||
#select(-c(sampleID)) %>% | ||
group_by(db_name, taxon, taxonomy_id, lineage) %>% nest() | ||
|
||
filter_by_correlation = F | ||
|
||
if (filter_by_correlation==T){ | ||
|
||
# Filter by correlation strength | ||
data_nest_pH <- merged_df_nest %>% | ||
mutate(model = map(data, cor_fun_pH)) %>% | ||
select(-data) %>% unnest(cols = c(model)) %>% | ||
filter(p.value < 0.0001 & abs(estimate) > .8 ) | ||
species_of_interest_pH <- data_nest_pH$taxon | ||
|
||
data_nest_temperature <- merged_df_nest %>% | ||
mutate(model = map(data, cor_fun_temperature)) %>% | ||
select(-data) %>% unnest(cols = c(model)) %>% | ||
filter(p.value < 0.0001 & abs(estimate) > .3) | ||
species_of_interest_temperature <- data_nest_temperature$taxon | ||
|
||
# species_of_interest <- species_of_interest_carbon | ||
# species_of_interest <- species_of_interest_pH | ||
# species_of_interest <- species_of_interest_nitr | ||
|
||
species_of_interest = c(species_of_interest_pH, species_of_interest_temperature) %>% | ||
unique() | ||
|
||
} else species_of_interest = unique(merged_df_nest$taxon) %>% sample(5000) | ||
|
||
# Assign biome preferences for species with pH or temperature correlation | ||
data_nest_biome <- merged_df_nest %>% filter(taxon %in% species_of_interest) | ||
|
||
# Do this in chunks otherwise the map() function freezes | ||
df_biome_pt1 <- data_nest_biome[1:1000,] %>% | ||
mutate(biome_presence = map(data, assign_biome_presence)) | ||
|
||
df_biome_pt2 <- data_nest_biome[1001:3000,] %>% | ||
mutate(biome_presence = map(data, assign_biome_presence)) | ||
|
||
df_biome_pt3 <- data_nest_biome[3001:5000,] %>% | ||
mutate(biome_presence = map(data, assign_biome_presence)) | ||
|
||
# df_biome_pt4 <- data_nest_biome[5001:nrow(data_nest_biome),] %>% | ||
# mutate(biome_presence = map(data, assign_biome_presence)) | ||
|
||
#df_biome = df_biome_pt1 | ||
df_biome = data.table::rbindlist(list(df_biome_pt1, df_biome_pt2, df_biome_pt3)) %>% | ||
select(-data) %>% unnest(cols = c(biome_presence)) %>% ungroup | ||
|
||
df_biome_wide = df_biome %>% | ||
select(-c(biome_count, prevalence)) %>% | ||
pivot_wider(names_from = nlcdClass, values_from = present_in_biome) | ||
|
||
|
||
|
||
# Loop through GEM data and match by species or genus | ||
gem_out = list() | ||
for (i in 1:length(species_of_interest)){ | ||
print(species_of_interest[[i]]) | ||
spec = species_of_interest[[i]] | ||
genus = word(spec) | ||
if (spec %in% soil_GEM_info$Species) { | ||
match_out=soil_GEM_info[which(soil_GEM_info$Species==spec),] | ||
match_out$match_by="Species" | ||
} else if (genus %in% soil_GEM_info$Genus) { | ||
match_out=soil_GEM_info[which(soil_GEM_info$Genus==genus),] | ||
match_out$match_by="Genus" | ||
} else { | ||
match_out = data.frame(GEM_ID = NA,Kingdom=NA,Genus=NA,Species.strain=NA, | ||
Citation=NA,Source=NA,Filepath=NA, | ||
Species = NA, | ||
match_by="No curated GEM at species or genus level") | ||
} | ||
match_out <- match_out %>% select(-Species) | ||
match_out$Species_of_interest = spec | ||
gem_out[[i]] = match_out | ||
} | ||
|
||
gem_out_table = data.table::rbindlist(gem_out) | ||
|
||
# gem_out_table$pH_significant = ifelse(gem_out_table$Species_of_interest %in% species_of_interest_pH, "Yes", "No") | ||
# gem_out_table$temp_significant = ifelse(gem_out_table$Species_of_interest %in% species_of_interest_temperature, "Yes", "No") | ||
|
||
|
||
functional_gems <- c("hanpo","iAA1300","iAF692","iFC579","iHN637","iJN7462","iRi1574","iSB619","iYO844","iGC535","iBM3063","iBB1018","iMG746","iMM904","iNmo686","iYY1101","iGD1348","iCY1106","iRhto1108") | ||
gem_out_table$Functional = ifelse(gem_out_table$GEM_ID %in% functional_gems, "Yes", "No") | ||
|
||
gem_out_table <- left_join(gem_out_table, df_biome_wide, join_by(Species_of_interest == taxon)) | ||
|
||
gem_out_table = gem_out_table %>% | ||
filter(db_name == "soil_microbe_db") %>% | ||
#arrange(Functional) %>% | ||
# arrange(GEM_ID) | ||
rename("Species of interest" = Species_of_interest, | ||
"Match criteria" = match_by, | ||
"GEM ID" = GEM_ID, | ||
#Filepath, | ||
"Functional in COMETS?" = Functional#, | ||
#"Temperature sensitive" = temp_significant, | ||
#"pH sensitive" = pH_significant | ||
) %>% | ||
arrange(desc(`Functional in COMETS?`)) | ||
#formattable::formattable(gem_out_print) | ||
|
||
gem_out_table <- left_join(gem_out_table, merged_df %>% select(`Species of interest`=name,taxonomy_id, source) %>% distinct()) | ||
|
||
write.csv(gem_out_table, "./species_list_for_smartCOMETS.csv") | ||
|
||
# write.csv(merged_df %>% filter(db_name == "soil_microbe_db"), | ||
# "/projectnb/frpmars/soil_microbe_db/shiny_app/species_abundance.csv") | ||
|
||
# # Filter to just 5000 random organisms, to make data smaller for now | ||
abundance_data_filt = merged_df %>% | ||
filter(taxon %in% species_of_interest) | ||
|
||
write.csv(abundance_data_filt, "/projectnb2/talbot-lab-data/zrwerbin/soil_microbe_GEMs/comets_shinyapp_example/species_abundance_filt.csv") | ||
|
||
|
||
# cp '/projectnb/frpmars/soil_microbe_db/Soil GEMs - manually curated.csv' /projectnb2/talbot-lab-data/zrwerbin/soil_microbe_GEMs/comets_shinyapp_example/ | ||
# cp /projectnb/frpmars/soil_microbe_db/species_list_for_smartCOMETS.csv /projectnb2/talbot-lab-data/zrwerbin/soil_microbe_GEMs/comets_shinyapp_example/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
|
||
# First run 01_merge_env_data.r | ||
|
||
library(tidyverse) | ||
library(data.table) | ||
library(broom) | ||
|
||
# For future_map | ||
library(segmented) | ||
library(furrr) | ||
plan(strategy = "multisession") | ||
|
||
|
||
#setwd("/projectnb/frpmars/soil_microbe_db/") | ||
|
||
# Function to assign biome preference to soil organisms based on prevalence | ||
assign_biome_presence = function(taxon_df, prevalence_cutoff=.05) { | ||
|
||
# get counts of samples per biome | ||
sample_counts = taxon_df %>% group_by(nlcdClass) %>% tally(name = "biome_count") | ||
|
||
# get counts of non-zero abundances per biome | ||
presence_counts = taxon_df %>% | ||
mutate(present = ifelse(percentage > 0, 1, 0)) %>% | ||
group_by(nlcdClass, present) %>% tally(name = "presence_count") | ||
|
||
# ensure all combinations are present | ||
presence_counts_expanded = presence_counts %>% | ||
ungroup %>% | ||
expand(nlcdClass, present) | ||
# set NA values to zero | ||
presence_counts_expanded = merge(presence_counts, presence_counts_expanded, all=T) %>% | ||
replace_na(list(presence_count = 0)) | ||
|
||
# Get prevalence (percent of biome samples with nonzero abundance) | ||
df_merged = merge(presence_counts_expanded, sample_counts, all=T) %>% | ||
filter(present == 1) %>% | ||
mutate(prevalence = presence_count/biome_count, | ||
present_in_biome = ifelse(prevalence > prevalence_cutoff, 1, 0)) | ||
|
||
df_out = df_merged %>% | ||
dplyr::select(nlcdClass, biome_count, prevalence, present_in_biome) | ||
return(df_out) | ||
} | ||
|
||
# Read in sample-level abundances with biome info | ||
sample_abundance <- fread("./sample_abundance_data.csv") | ||
|
||
# Nest by taxon before running function | ||
sample_abundance_nest = sample_abundance %>% | ||
group_by(db_name, taxon, taxonomy_id, lineage) %>% nest() | ||
|
||
#species_of_interest = unique(merged_df_nest$taxon) %>% sample(1000) | ||
#data_nest_biome <- merged_df_nest %>% filter(taxon %in% species_of_interest) | ||
|
||
# Do this in chunks otherwise the map() function freezes | ||
df_biome_pt1 <- sample_abundance_nest[1:5000,] %>% | ||
mutate(biome_presence = future_map(data, assign_biome_presence)) | ||
|
||
df_biome_pt2 <- sample_abundance_nest[5001:10000,] %>% | ||
mutate(biome_presence = future_map(data, assign_biome_presence)) | ||
|
||
df_biome_pt3 <- sample_abundance_nest[10001:nrow(sample_abundance_nest),] %>% | ||
mutate(biome_presence = future_map(data, assign_biome_presence)) | ||
|
||
# Combine and unlist | ||
df_biome = data.table::rbindlist(list(df_biome_pt1, df_biome_pt2, df_biome_pt3)) %>% | ||
dplyr::select(-data) %>% | ||
unnest(cols = c(biome_presence)) %>% | ||
ungroup | ||
|
||
# Create separate column for each biome's "present in biome" (true/false = 1/0) | ||
df_biome_wide = df_biome %>% | ||
dplyr::select(-c(biome_count, prevalence)) %>% | ||
pivot_wider(names_from = nlcdClass, values_from = present_in_biome) | ||
|
||
#nrow(df_biome_wide) # 18576 taxa | ||
write_csv(df_biome_wide, "./intermediate_data/organism_biome_preference.csv") | ||
|
||
|
||
|
||
|
||
table(df_biome_wide$cultivatedCrops) | ||
df_biome %>% group_by(nlcdClass, present_in_biome) %>% tally() | ||
|
||
head(df_biome_pt1$biome_presence) | ||
|
||
p3 = ggplot(merged_df %>% | ||
dplyr::filter(taxon %in% species_of_interest_carbon[1:4]), | ||
aes(x = organicd13C, y = percentage, color=taxon)) + | ||
geom_point(alpha=.5, | ||
position=position_jitter(width = .01, height=0), size=2, | ||
show.legend = F) + | ||
geom_smooth(show.legend = F) + | ||
facet_wrap(~taxon, scales = "free") + theme_bw(base_size = 22) + | ||
scale_y_sqrt() + xlab("Soil organic carbon") + ylab("Microbial abundance") | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
library(tidyverse) | ||
library(broom) | ||
library(data.table) | ||
|
||
# First run 01_merge_env_data.r | ||
|
||
# Read in sample-level abundances with biome info | ||
sample_abundance <- fread("./sample_abundance_data.csv") | ||
|
||
|
||
# Function to find the peak pH/temperature value | ||
loess_maximum <- function(x, y, tol = .Machine$double.eps^0.5){ | ||
model <- loess(y ~ x,span = 0.8) | ||
yfit <- model$fitted | ||
inx <- which(abs(yfit - max(yfit)) < tol)[[1]] | ||
list(x = x[inx], y.fitted = yfit[inx], ix = inx) | ||
} | ||
|
||
# Get pH of maximum abundance trends per taxon | ||
pH_max = sample_abundance %>% | ||
group_by(taxonomy_id, taxon) %>% | ||
filter(!is.na(soilInCaClpH)) %>% | ||
summarize(pH_preference = loess_maximum(soilInCaClpH, percentage)[[1]]) %>% | ||
rename("Species of interest" = taxon) | ||
|
||
# Get temp of maximum abundance trends per taxon | ||
temp_max = sample_abundance %>% | ||
group_by(taxonomy_id, taxon) %>% | ||
filter(!is.na(soilTemp)) %>% | ||
summarize(temperature_preference = loess_maximum(soilTemp, percentage)[[1]]) %>% | ||
rename("Species of interest" = taxon) | ||
|
||
# Combine and save | ||
organism_preference = full_join(temp_max, pH_max) | ||
write_csv(organism_preference, "./intermediate_data/organism_pH_temp_preference.csv") | ||
|
Oops, something went wrong.