Merge commit '2e071bdcdae8d5e6028b4e072b0236852a669592'

hicsail · Nov 25, 2024 · 919ce50 · 919ce50
2 parents 67871be + 2e071bd
commit 919ce50
Show file tree

Hide file tree

Showing 19 changed files with 117,730 additions and 18,572 deletions.
diff --git a/README.md b/README.md
@@ -4,10 +4,10 @@ Compilation of published genome-scale metabolic models (GEMs) for constraint-bas
 Each organism has its own directory. GEMs are uploaded in various formats depending on the original publication (usually XML/SBML). An R script was used to enforce a common nomenclature for all exchange metabolites (`01_modify_metabolites.r`). Python was used to verify COBRA compliance and ensure that growth rates match the rates described in publications. (`/02_make_cobra_compliant.ipynb`). GEMs have also been evaluated for growth on various carbon substrates (`evaluate_carbon_usage.ipynb`). 
 
 Simulation-ready GEMs can be integrated into community modeling frameworks of various flavors:<br>
- [![COBRApy](https://img.shields.io/badge/-COBRApy-028?&logo=GitHub)](https://github.com/opencobra/cobrapy)&nbsp; standard Flux-Balance Analysis (FBA), Flux-Variability Analysis (FVA) 
- <br>
-[![COMETSpy](https://img.shields.io/badge/-COMETSpy-028?&logo=GitHub)](https://github.com/segrelab/cometspy)&nbsp; dynamic FBA, spatiotemporal simulations
+    [![COBRApy](https://img.shields.io/badge/-COBRApy-028?&logo=GitHub)](https://github.com/opencobra/cobrapy)&nbsp; standard Flux-Balance Analysis (FBA), Flux-Variability Analysis (FVA) 
 <br>
-[![MICOM](https://img.shields.io/badge/-MICOM-028?&logo=GitHub)](https://github.com/micom-dev)&nbsp; community trade-offs, growth rate estimation
+    [![COMETSpy](https://img.shields.io/badge/-COMETSpy-028?&logo=GitHub)](https://github.com/segrelab/cometspy)&nbsp; dynamic FBA, spatiotemporal simulations
+<br>
+    [![MICOM](https://img.shields.io/badge/-MICOM-028?&logo=GitHub)](https://github.com/micom-dev)&nbsp; community trade-offs, growth rate estimation
 
 60+ more GEMs to come soon!
diff --git a/comets_shinyapp_example/01_merge_env_data.r b/comets_shinyapp_example/01_merge_env_data.r
@@ -0,0 +1,170 @@
+library(tidyverse)
+library(broom)
+
+setwd("/projectnb/frpmars/soil_microbe_db/")
+source("/projectnb/frpmars/soil_microbe_db/scripts/helper_functions.r")
+
+set.seed(1)
+# Read in microbe abundances, soil metadata, and GEM information
+species_abundances <- fread("./soil_microbe_db_abundances.csv")  
+# length(unique(species_abundances$taxonomy_id)) # 26204 unique taxa
+#%>% 
+	#select(-`...1`) %>% 
+	# Remove some taxon categories that we probably don't want (no viruses)
+#	filter(!taxon %in% c("Classified at a higher level","Unclassified")) %>% 
+#	filter(!grepl("virus", taxon))
+
+soil_metadata <- fread("../ref_data/environmental_metadata_NEON.csv") %>% 
+	dplyr::select(-V1) %>% 
+    mutate(sample_id = paste0(genomicsSampleID, "_soil_microbe_db_filtered")) %>% 
+    filter(sample_id %in% species_abundances$sample_id)
+
+# Combine into one dataframe 
+sample_abundance = merge(species_abundances %>% filter(!is.na(source)), 
+									soil_metadata, by.x = "sample_id", by.y = "sample_id") 
+sample_abundance$db_name = "soil_microbe_db"
+sample_abundance$taxon=sample_abundance$name
+
+
+organism_info = sample_abundance %>% distinct(source, taxonomy_id, is_MAG, taxid_lineage, lineage, name, taxon)
+write_csv(organism_info, "./intermediate_data/organism_info.csv")
+
+
+# Filter species by prevalence in at least 50 samples
+n_sample_prevalence = sample_abundance %>% group_by(name) %>% tally(name = "n_samples")
+sample_abundance <- left_join(sample_abundance, n_sample_prevalence)
+sample_abundance <- sample_abundance %>% filter(n_samples > 50)
+length(unique(sample_abundance$taxonomy_id)) # 18589 unique taxa
+write_csv(sample_abundance, "./sample_abundance_data.csv")
+
+
+sample_abundance_lean = sample_abundance %>% dplyr::select(taxonomy_id, percentage, sample_id)
+write_csv(sample_abundance_lean, "./intermediate_data/sample_abundance_data_lean.csv")
+
+env_data_lean = sample_abundance %>% dplyr::select(sample_id)
+write_csv(sample_abundance_lean, "./intermediate_data/sample_abundance_data_lean.csv")
+
+
+# Establish 
+merged_df_nest = merged_df %>% 
+	#select(-c(sampleID)) %>% 
+	group_by(db_name, taxon, taxonomy_id, lineage) %>% nest()
+
+filter_by_correlation = F
+
+if (filter_by_correlation==T){
+
+# Filter by correlation strength
+data_nest_pH <- merged_df_nest %>% 
+	mutate(model = map(data, cor_fun_pH)) %>% 
+	select(-data) %>% unnest(cols = c(model)) %>% 
+	filter(p.value < 0.0001 & abs(estimate) > .8 )
+species_of_interest_pH <- data_nest_pH$taxon
+
+data_nest_temperature <- merged_df_nest %>% 
+	mutate(model = map(data, cor_fun_temperature)) %>% 
+	select(-data) %>% unnest(cols = c(model)) %>% 
+	filter(p.value < 0.0001 & abs(estimate) > .3)
+species_of_interest_temperature <- data_nest_temperature$taxon
+
+# species_of_interest <- species_of_interest_carbon
+# species_of_interest <- species_of_interest_pH
+# species_of_interest <- species_of_interest_nitr
+
+species_of_interest = c(species_of_interest_pH, species_of_interest_temperature) %>% 
+	unique()
+
+} else species_of_interest = unique(merged_df_nest$taxon) %>% sample(5000)
+
+# Assign biome preferences for species with pH or temperature correlation
+data_nest_biome <- merged_df_nest %>% filter(taxon %in% species_of_interest) 
+
+# Do this in chunks otherwise the map() function freezes
+df_biome_pt1 <- data_nest_biome[1:1000,] %>% 
+	mutate(biome_presence = map(data, assign_biome_presence)) 
+
+df_biome_pt2 <- data_nest_biome[1001:3000,] %>% 
+	mutate(biome_presence = map(data, assign_biome_presence)) 
+
+df_biome_pt3 <- data_nest_biome[3001:5000,] %>% 
+	mutate(biome_presence = map(data, assign_biome_presence)) 
+
+# df_biome_pt4 <- data_nest_biome[5001:nrow(data_nest_biome),] %>% 
+# 	mutate(biome_presence = map(data, assign_biome_presence)) 
+
+#df_biome = df_biome_pt1
+df_biome = data.table::rbindlist(list(df_biome_pt1, df_biome_pt2, df_biome_pt3)) %>% 
+	select(-data) %>% unnest(cols = c(biome_presence)) %>% ungroup 
+
+df_biome_wide = df_biome %>% 
+	select(-c(biome_count, prevalence)) %>% 
+	pivot_wider(names_from = nlcdClass, values_from = present_in_biome)
+
+
+
+# Loop through GEM data and match by species or genus
+gem_out = list()
+for (i in 1:length(species_of_interest)){
+	print(species_of_interest[[i]])
+	spec = species_of_interest[[i]]
+	genus = word(spec)
+	if (spec %in% soil_GEM_info$Species) {
+		match_out=soil_GEM_info[which(soil_GEM_info$Species==spec),]
+		match_out$match_by="Species"
+	} else if (genus %in% soil_GEM_info$Genus) {
+		match_out=soil_GEM_info[which(soil_GEM_info$Genus==genus),]
+		match_out$match_by="Genus"
+	} else {
+		match_out = data.frame(GEM_ID = NA,Kingdom=NA,Genus=NA,Species.strain=NA,
+													 Citation=NA,Source=NA,Filepath=NA,
+													 Species = NA,
+													 match_by="No curated GEM at species or genus level")
+	}
+	match_out <- match_out %>% select(-Species)
+	match_out$Species_of_interest = spec
+gem_out[[i]] = match_out
+}
+
+gem_out_table = data.table::rbindlist(gem_out)
+
+# gem_out_table$pH_significant = ifelse(gem_out_table$Species_of_interest %in% species_of_interest_pH, "Yes", "No")
+# gem_out_table$temp_significant = ifelse(gem_out_table$Species_of_interest %in% species_of_interest_temperature, "Yes", "No")
+
+
+functional_gems <- c("hanpo","iAA1300","iAF692","iFC579","iHN637","iJN7462","iRi1574","iSB619","iYO844","iGC535","iBM3063","iBB1018","iMG746","iMM904","iNmo686","iYY1101","iGD1348","iCY1106","iRhto1108")
+gem_out_table$Functional = ifelse(gem_out_table$GEM_ID %in% functional_gems, "Yes", "No")
+
+gem_out_table <- left_join(gem_out_table, df_biome_wide, join_by(Species_of_interest == taxon))
+
+gem_out_table = gem_out_table %>% 
+    filter(db_name == "soil_microbe_db") %>% 
+	#arrange(Functional) %>% 
+	#	arrange(GEM_ID)
+	rename("Species of interest" = Species_of_interest, 
+				 "Match criteria" = match_by, 
+				 "GEM ID" = GEM_ID, 
+				 #Filepath, 
+				 "Functional in COMETS?" = Functional#, 
+				 #"Temperature sensitive" = temp_significant,
+				 #"pH sensitive" = pH_significant
+				 ) %>% 
+	arrange(desc(`Functional in COMETS?`)) 
+#formattable::formattable(gem_out_print)
+
+gem_out_table <- left_join(gem_out_table, merged_df %>% select(`Species of interest`=name,taxonomy_id, source) %>% distinct())
+
+write.csv(gem_out_table, "./species_list_for_smartCOMETS.csv")
+
+# write.csv(merged_df %>% filter(db_name == "soil_microbe_db"), 
+# 					"/projectnb/frpmars/soil_microbe_db/shiny_app/species_abundance.csv")
+
+# # Filter to just 5000 random organisms, to make data smaller for now
+abundance_data_filt = merged_df %>%
+	filter(taxon %in% species_of_interest)
+
+write.csv(abundance_data_filt, "/projectnb2/talbot-lab-data/zrwerbin/soil_microbe_GEMs/comets_shinyapp_example/species_abundance_filt.csv")
+
+
+# cp '/projectnb/frpmars/soil_microbe_db/Soil GEMs - manually curated.csv' /projectnb2/talbot-lab-data/zrwerbin/soil_microbe_GEMs/comets_shinyapp_example/
+# cp /projectnb/frpmars/soil_microbe_db/species_list_for_smartCOMETS.csv /projectnb2/talbot-lab-data/zrwerbin/soil_microbe_GEMs/comets_shinyapp_example/
+
diff --git a/comets_shinyapp_example/02_determine_biome_preference.r b/comets_shinyapp_example/02_determine_biome_preference.r
@@ -0,0 +1,97 @@
+
+# First run 01_merge_env_data.r
+
+library(tidyverse)
+library(data.table)
+library(broom)
+
+# For future_map
+library(segmented)
+library(furrr)
+plan(strategy = "multisession")
+
+
+#setwd("/projectnb/frpmars/soil_microbe_db/")
+
+# Function to assign biome preference to soil organisms based on prevalence 
+assign_biome_presence = function(taxon_df, prevalence_cutoff=.05) {
+
+    # get counts of samples per biome 
+    sample_counts = taxon_df %>% group_by(nlcdClass) %>% tally(name = "biome_count")
+
+    # get counts of non-zero abundances per biome
+    presence_counts = taxon_df %>% 
+        mutate(present = ifelse(percentage > 0, 1, 0)) %>% 
+        group_by(nlcdClass, present) %>% tally(name = "presence_count")
+
+    # ensure all combinations are present
+    presence_counts_expanded = presence_counts %>% 
+        ungroup %>%  
+        expand(nlcdClass, present)
+    # set NA values to zero
+    presence_counts_expanded = merge(presence_counts, presence_counts_expanded, all=T) %>% 
+        replace_na(list(presence_count = 0))
+
+    # Get prevalence (percent of biome samples with nonzero abundance)
+    df_merged = merge(presence_counts_expanded, sample_counts, all=T) %>% 
+        filter(present == 1) %>% 
+        mutate(prevalence = presence_count/biome_count,
+               present_in_biome = ifelse(prevalence > prevalence_cutoff, 1, 0))
+
+    df_out = df_merged  %>% 
+        dplyr::select(nlcdClass, biome_count, prevalence, present_in_biome)
+    return(df_out)
+}
+
+# Read in sample-level abundances with biome info
+sample_abundance <- fread("./sample_abundance_data.csv")
+
+# Nest by taxon before running function
+sample_abundance_nest = sample_abundance %>% 
+    group_by(db_name, taxon, taxonomy_id, lineage) %>% nest()
+
+#species_of_interest = unique(merged_df_nest$taxon) %>% sample(1000)
+#data_nest_biome <- merged_df_nest %>% filter(taxon %in% species_of_interest) 
+
+# Do this in chunks otherwise the map() function freezes
+df_biome_pt1 <- sample_abundance_nest[1:5000,] %>% 
+    mutate(biome_presence = future_map(data, assign_biome_presence)) 
+
+df_biome_pt2 <- sample_abundance_nest[5001:10000,] %>% 
+    mutate(biome_presence = future_map(data, assign_biome_presence)) 
+
+df_biome_pt3 <- sample_abundance_nest[10001:nrow(sample_abundance_nest),] %>% 
+    mutate(biome_presence = future_map(data, assign_biome_presence)) 
+
+# Combine and unlist
+df_biome = data.table::rbindlist(list(df_biome_pt1, df_biome_pt2, df_biome_pt3)) %>% 
+    dplyr::select(-data) %>% 
+    unnest(cols = c(biome_presence)) %>% 
+    ungroup 
+
+# Create separate column for each biome's "present in biome" (true/false = 1/0)
+df_biome_wide = df_biome %>% 
+    dplyr::select(-c(biome_count, prevalence)) %>% 
+    pivot_wider(names_from = nlcdClass, values_from = present_in_biome)
+
+#nrow(df_biome_wide) # 18576 taxa
+write_csv(df_biome_wide, "./intermediate_data/organism_biome_preference.csv")
+
+
+
+
+table(df_biome_wide$cultivatedCrops)
+df_biome %>% group_by(nlcdClass, present_in_biome) %>% tally()
+
+head(df_biome_pt1$biome_presence)
+
+p3 = ggplot(merged_df %>% 
+                dplyr::filter(taxon %in% species_of_interest_carbon[1:4]),
+            aes(x = organicd13C, y = percentage, color=taxon)) +
+    geom_point(alpha=.5, 
+               position=position_jitter(width = .01, height=0), size=2, 
+               show.legend = F) + 
+    geom_smooth(show.legend = F) +
+    facet_wrap(~taxon, scales = "free") + theme_bw(base_size = 22) + 
+    scale_y_sqrt() + xlab("Soil organic carbon") + ylab("Microbial abundance")
+
diff --git a/comets_shinyapp_example/03_determine_pH_temp_preference.r b/comets_shinyapp_example/03_determine_pH_temp_preference.r
@@ -0,0 +1,36 @@
+library(tidyverse)
+library(broom)
+library(data.table)
+
+# First run 01_merge_env_data.r
+
+# Read in sample-level abundances with biome info
+sample_abundance <- fread("./sample_abundance_data.csv")
+
+
+# Function to find the peak pH/temperature value
+loess_maximum <- function(x, y, tol = .Machine$double.eps^0.5){
+    model <- loess(y ~ x,span = 0.8)
+    yfit <- model$fitted
+    inx <- which(abs(yfit - max(yfit)) < tol)[[1]]
+    list(x = x[inx], y.fitted = yfit[inx], ix = inx)
+}
+
+# Get pH of maximum abundance trends per taxon
+pH_max = sample_abundance %>% 
+    group_by(taxonomy_id, taxon) %>% 
+    filter(!is.na(soilInCaClpH)) %>% 
+    summarize(pH_preference = loess_maximum(soilInCaClpH, percentage)[[1]]) %>% 
+    rename("Species of interest" = taxon)
+
+# Get temp of maximum abundance trends per taxon
+temp_max = sample_abundance %>% 
+    group_by(taxonomy_id, taxon) %>% 
+    filter(!is.na(soilTemp)) %>% 
+    summarize(temperature_preference = loess_maximum(soilTemp, percentage)[[1]])  %>% 
+    rename("Species of interest" = taxon)
+
+# Combine and save
+organism_preference = full_join(temp_max, pH_max)
+write_csv(organism_preference, "./intermediate_data/organism_pH_temp_preference.csv")
+