Skip to content

Commit

Permalink
Merge commit '2e071bdcdae8d5e6028b4e072b0236852a669592'
Browse files Browse the repository at this point in the history
  • Loading branch information
gregfrasco committed Nov 25, 2024
2 parents 67871be + 2e071bd commit 919ce50
Show file tree
Hide file tree
Showing 19 changed files with 117,730 additions and 18,572 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ Compilation of published genome-scale metabolic models (GEMs) for constraint-bas
Each organism has its own directory. GEMs are uploaded in various formats depending on the original publication (usually XML/SBML). An R script was used to enforce a common nomenclature for all exchange metabolites (`01_modify_metabolites.r`). Python was used to verify COBRA compliance and ensure that growth rates match the rates described in publications. (`/02_make_cobra_compliant.ipynb`). GEMs have also been evaluated for growth on various carbon substrates (`evaluate_carbon_usage.ipynb`).

Simulation-ready GEMs can be integrated into community modeling frameworks of various flavors:<br>
[![COBRApy](https://img.shields.io/badge/-COBRApy-028?&logo=GitHub)](https://github.com/opencobra/cobrapy)&nbsp; standard Flux-Balance Analysis (FBA), Flux-Variability Analysis (FVA)
<br>
[![COMETSpy](https://img.shields.io/badge/-COMETSpy-028?&logo=GitHub)](https://github.com/segrelab/cometspy)&nbsp; dynamic FBA, spatiotemporal simulations
[![COBRApy](https://img.shields.io/badge/-COBRApy-028?&logo=GitHub)](https://github.com/opencobra/cobrapy)&nbsp; standard Flux-Balance Analysis (FBA), Flux-Variability Analysis (FVA)
<br>
[![MICOM](https://img.shields.io/badge/-MICOM-028?&logo=GitHub)](https://github.com/micom-dev)&nbsp; community trade-offs, growth rate estimation
[![COMETSpy](https://img.shields.io/badge/-COMETSpy-028?&logo=GitHub)](https://github.com/segrelab/cometspy)&nbsp; dynamic FBA, spatiotemporal simulations
<br>
[![MICOM](https://img.shields.io/badge/-MICOM-028?&logo=GitHub)](https://github.com/micom-dev)&nbsp; community trade-offs, growth rate estimation

60+ more GEMs to come soon!
170 changes: 170 additions & 0 deletions comets_shinyapp_example/01_merge_env_data.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
library(tidyverse)
library(broom)

setwd("/projectnb/frpmars/soil_microbe_db/")
source("/projectnb/frpmars/soil_microbe_db/scripts/helper_functions.r")

set.seed(1)
# Read in microbe abundances, soil metadata, and GEM information
species_abundances <- fread("./soil_microbe_db_abundances.csv")
# length(unique(species_abundances$taxonomy_id)) # 26204 unique taxa
#%>%
#select(-`...1`) %>%
# Remove some taxon categories that we probably don't want (no viruses)
# filter(!taxon %in% c("Classified at a higher level","Unclassified")) %>%
# filter(!grepl("virus", taxon))

soil_metadata <- fread("../ref_data/environmental_metadata_NEON.csv") %>%
dplyr::select(-V1) %>%
mutate(sample_id = paste0(genomicsSampleID, "_soil_microbe_db_filtered")) %>%
filter(sample_id %in% species_abundances$sample_id)

# Combine into one dataframe
sample_abundance = merge(species_abundances %>% filter(!is.na(source)),
soil_metadata, by.x = "sample_id", by.y = "sample_id")
sample_abundance$db_name = "soil_microbe_db"
sample_abundance$taxon=sample_abundance$name


organism_info = sample_abundance %>% distinct(source, taxonomy_id, is_MAG, taxid_lineage, lineage, name, taxon)
write_csv(organism_info, "./intermediate_data/organism_info.csv")


# Filter species by prevalence in at least 50 samples
n_sample_prevalence = sample_abundance %>% group_by(name) %>% tally(name = "n_samples")
sample_abundance <- left_join(sample_abundance, n_sample_prevalence)
sample_abundance <- sample_abundance %>% filter(n_samples > 50)
length(unique(sample_abundance$taxonomy_id)) # 18589 unique taxa
write_csv(sample_abundance, "./sample_abundance_data.csv")


sample_abundance_lean = sample_abundance %>% dplyr::select(taxonomy_id, percentage, sample_id)
write_csv(sample_abundance_lean, "./intermediate_data/sample_abundance_data_lean.csv")

env_data_lean = sample_abundance %>% dplyr::select(sample_id)
write_csv(sample_abundance_lean, "./intermediate_data/sample_abundance_data_lean.csv")


# Establish
merged_df_nest = merged_df %>%
#select(-c(sampleID)) %>%
group_by(db_name, taxon, taxonomy_id, lineage) %>% nest()

filter_by_correlation = F

if (filter_by_correlation==T){

# Filter by correlation strength
data_nest_pH <- merged_df_nest %>%
mutate(model = map(data, cor_fun_pH)) %>%
select(-data) %>% unnest(cols = c(model)) %>%
filter(p.value < 0.0001 & abs(estimate) > .8 )
species_of_interest_pH <- data_nest_pH$taxon

data_nest_temperature <- merged_df_nest %>%
mutate(model = map(data, cor_fun_temperature)) %>%
select(-data) %>% unnest(cols = c(model)) %>%
filter(p.value < 0.0001 & abs(estimate) > .3)
species_of_interest_temperature <- data_nest_temperature$taxon

# species_of_interest <- species_of_interest_carbon
# species_of_interest <- species_of_interest_pH
# species_of_interest <- species_of_interest_nitr

species_of_interest = c(species_of_interest_pH, species_of_interest_temperature) %>%
unique()

} else species_of_interest = unique(merged_df_nest$taxon) %>% sample(5000)

# Assign biome preferences for species with pH or temperature correlation
data_nest_biome <- merged_df_nest %>% filter(taxon %in% species_of_interest)

# Do this in chunks otherwise the map() function freezes
df_biome_pt1 <- data_nest_biome[1:1000,] %>%
mutate(biome_presence = map(data, assign_biome_presence))

df_biome_pt2 <- data_nest_biome[1001:3000,] %>%
mutate(biome_presence = map(data, assign_biome_presence))

df_biome_pt3 <- data_nest_biome[3001:5000,] %>%
mutate(biome_presence = map(data, assign_biome_presence))

# df_biome_pt4 <- data_nest_biome[5001:nrow(data_nest_biome),] %>%
# mutate(biome_presence = map(data, assign_biome_presence))

#df_biome = df_biome_pt1
df_biome = data.table::rbindlist(list(df_biome_pt1, df_biome_pt2, df_biome_pt3)) %>%
select(-data) %>% unnest(cols = c(biome_presence)) %>% ungroup

df_biome_wide = df_biome %>%
select(-c(biome_count, prevalence)) %>%
pivot_wider(names_from = nlcdClass, values_from = present_in_biome)



# Loop through GEM data and match by species or genus
gem_out = list()
for (i in 1:length(species_of_interest)){
print(species_of_interest[[i]])
spec = species_of_interest[[i]]
genus = word(spec)
if (spec %in% soil_GEM_info$Species) {
match_out=soil_GEM_info[which(soil_GEM_info$Species==spec),]
match_out$match_by="Species"
} else if (genus %in% soil_GEM_info$Genus) {
match_out=soil_GEM_info[which(soil_GEM_info$Genus==genus),]
match_out$match_by="Genus"
} else {
match_out = data.frame(GEM_ID = NA,Kingdom=NA,Genus=NA,Species.strain=NA,
Citation=NA,Source=NA,Filepath=NA,
Species = NA,
match_by="No curated GEM at species or genus level")
}
match_out <- match_out %>% select(-Species)
match_out$Species_of_interest = spec
gem_out[[i]] = match_out
}

gem_out_table = data.table::rbindlist(gem_out)

# gem_out_table$pH_significant = ifelse(gem_out_table$Species_of_interest %in% species_of_interest_pH, "Yes", "No")
# gem_out_table$temp_significant = ifelse(gem_out_table$Species_of_interest %in% species_of_interest_temperature, "Yes", "No")


functional_gems <- c("hanpo","iAA1300","iAF692","iFC579","iHN637","iJN7462","iRi1574","iSB619","iYO844","iGC535","iBM3063","iBB1018","iMG746","iMM904","iNmo686","iYY1101","iGD1348","iCY1106","iRhto1108")
gem_out_table$Functional = ifelse(gem_out_table$GEM_ID %in% functional_gems, "Yes", "No")

gem_out_table <- left_join(gem_out_table, df_biome_wide, join_by(Species_of_interest == taxon))

gem_out_table = gem_out_table %>%
filter(db_name == "soil_microbe_db") %>%
#arrange(Functional) %>%
# arrange(GEM_ID)
rename("Species of interest" = Species_of_interest,
"Match criteria" = match_by,
"GEM ID" = GEM_ID,
#Filepath,
"Functional in COMETS?" = Functional#,
#"Temperature sensitive" = temp_significant,
#"pH sensitive" = pH_significant
) %>%
arrange(desc(`Functional in COMETS?`))
#formattable::formattable(gem_out_print)

gem_out_table <- left_join(gem_out_table, merged_df %>% select(`Species of interest`=name,taxonomy_id, source) %>% distinct())

write.csv(gem_out_table, "./species_list_for_smartCOMETS.csv")

# write.csv(merged_df %>% filter(db_name == "soil_microbe_db"),
# "/projectnb/frpmars/soil_microbe_db/shiny_app/species_abundance.csv")

# # Filter to just 5000 random organisms, to make data smaller for now
abundance_data_filt = merged_df %>%
filter(taxon %in% species_of_interest)

write.csv(abundance_data_filt, "/projectnb2/talbot-lab-data/zrwerbin/soil_microbe_GEMs/comets_shinyapp_example/species_abundance_filt.csv")


# cp '/projectnb/frpmars/soil_microbe_db/Soil GEMs - manually curated.csv' /projectnb2/talbot-lab-data/zrwerbin/soil_microbe_GEMs/comets_shinyapp_example/
# cp /projectnb/frpmars/soil_microbe_db/species_list_for_smartCOMETS.csv /projectnb2/talbot-lab-data/zrwerbin/soil_microbe_GEMs/comets_shinyapp_example/

97 changes: 97 additions & 0 deletions comets_shinyapp_example/02_determine_biome_preference.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@

# First run 01_merge_env_data.r

library(tidyverse)
library(data.table)
library(broom)

# For future_map
library(segmented)
library(furrr)
plan(strategy = "multisession")


#setwd("/projectnb/frpmars/soil_microbe_db/")

# Function to assign biome preference to soil organisms based on prevalence
assign_biome_presence = function(taxon_df, prevalence_cutoff=.05) {

# get counts of samples per biome
sample_counts = taxon_df %>% group_by(nlcdClass) %>% tally(name = "biome_count")

# get counts of non-zero abundances per biome
presence_counts = taxon_df %>%
mutate(present = ifelse(percentage > 0, 1, 0)) %>%
group_by(nlcdClass, present) %>% tally(name = "presence_count")

# ensure all combinations are present
presence_counts_expanded = presence_counts %>%
ungroup %>%
expand(nlcdClass, present)
# set NA values to zero
presence_counts_expanded = merge(presence_counts, presence_counts_expanded, all=T) %>%
replace_na(list(presence_count = 0))

# Get prevalence (percent of biome samples with nonzero abundance)
df_merged = merge(presence_counts_expanded, sample_counts, all=T) %>%
filter(present == 1) %>%
mutate(prevalence = presence_count/biome_count,
present_in_biome = ifelse(prevalence > prevalence_cutoff, 1, 0))

df_out = df_merged %>%
dplyr::select(nlcdClass, biome_count, prevalence, present_in_biome)
return(df_out)
}

# Read in sample-level abundances with biome info
sample_abundance <- fread("./sample_abundance_data.csv")

# Nest by taxon before running function
sample_abundance_nest = sample_abundance %>%
group_by(db_name, taxon, taxonomy_id, lineage) %>% nest()

#species_of_interest = unique(merged_df_nest$taxon) %>% sample(1000)
#data_nest_biome <- merged_df_nest %>% filter(taxon %in% species_of_interest)

# Do this in chunks otherwise the map() function freezes
df_biome_pt1 <- sample_abundance_nest[1:5000,] %>%
mutate(biome_presence = future_map(data, assign_biome_presence))

df_biome_pt2 <- sample_abundance_nest[5001:10000,] %>%
mutate(biome_presence = future_map(data, assign_biome_presence))

df_biome_pt3 <- sample_abundance_nest[10001:nrow(sample_abundance_nest),] %>%
mutate(biome_presence = future_map(data, assign_biome_presence))

# Combine and unlist
df_biome = data.table::rbindlist(list(df_biome_pt1, df_biome_pt2, df_biome_pt3)) %>%
dplyr::select(-data) %>%
unnest(cols = c(biome_presence)) %>%
ungroup

# Create separate column for each biome's "present in biome" (true/false = 1/0)
df_biome_wide = df_biome %>%
dplyr::select(-c(biome_count, prevalence)) %>%
pivot_wider(names_from = nlcdClass, values_from = present_in_biome)

#nrow(df_biome_wide) # 18576 taxa
write_csv(df_biome_wide, "./intermediate_data/organism_biome_preference.csv")




table(df_biome_wide$cultivatedCrops)
df_biome %>% group_by(nlcdClass, present_in_biome) %>% tally()

head(df_biome_pt1$biome_presence)

p3 = ggplot(merged_df %>%
dplyr::filter(taxon %in% species_of_interest_carbon[1:4]),
aes(x = organicd13C, y = percentage, color=taxon)) +
geom_point(alpha=.5,
position=position_jitter(width = .01, height=0), size=2,
show.legend = F) +
geom_smooth(show.legend = F) +
facet_wrap(~taxon, scales = "free") + theme_bw(base_size = 22) +
scale_y_sqrt() + xlab("Soil organic carbon") + ylab("Microbial abundance")

36 changes: 36 additions & 0 deletions comets_shinyapp_example/03_determine_pH_temp_preference.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
library(tidyverse)
library(broom)
library(data.table)

# First run 01_merge_env_data.r

# Read in sample-level abundances with biome info
sample_abundance <- fread("./sample_abundance_data.csv")


# Function to find the peak pH/temperature value
loess_maximum <- function(x, y, tol = .Machine$double.eps^0.5){
model <- loess(y ~ x,span = 0.8)
yfit <- model$fitted
inx <- which(abs(yfit - max(yfit)) < tol)[[1]]
list(x = x[inx], y.fitted = yfit[inx], ix = inx)
}

# Get pH of maximum abundance trends per taxon
pH_max = sample_abundance %>%
group_by(taxonomy_id, taxon) %>%
filter(!is.na(soilInCaClpH)) %>%
summarize(pH_preference = loess_maximum(soilInCaClpH, percentage)[[1]]) %>%
rename("Species of interest" = taxon)

# Get temp of maximum abundance trends per taxon
temp_max = sample_abundance %>%
group_by(taxonomy_id, taxon) %>%
filter(!is.na(soilTemp)) %>%
summarize(temperature_preference = loess_maximum(soilTemp, percentage)[[1]]) %>%
rename("Species of interest" = taxon)

# Combine and save
organism_preference = full_join(temp_max, pH_max)
write_csv(organism_preference, "./intermediate_data/organism_pH_temp_preference.csv")

Loading

0 comments on commit 919ce50

Please sign in to comment.