minor edits to readme and tables

afredston · May 5, 2023 · 892258b · 892258b
1 parent bfc0992
commit 892258b
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 18 deletions.
diff --git a/MHW_stats_and_figures_supplement.Rmd b/MHW_stats_and_figures_supplement.Rmd
@@ -251,7 +251,7 @@ lm.fixed.wt <- lm(wt_mt_log_scale ~ anom_sev_scale + factor(survey), data = mode
 gam.wt <- gam(wt_mt_log_scale ~ s(anom_sev_scale), data = modeldat )
 gam.re.wt <- gam(wt_mt_log_scale ~ s(anom_sev_scale) + s(survey, bs="re"), data = modeldat  %>% mutate(survey = as.factor(survey)))
 
-formulas.wt <- c('Biomass LR* $\\sim$ 1','Biomass LR* $\\sim$ MHW CInt*','Biomass LR* $\\sim$ MHW CInt* + Survey[fixed]','Biomass LR* $\\sim$ s(MHW CInt*)','Biomass LR* $\\sim$ s(MHW CInt*) + s(Survey[random])')
+formulas.wt <- c('Biomass LR* $\\sim$ 1','Biomass LR* $\\sim$ MHW CInt*','Biomass LR* $\\sim$ MHW CInt* + Survey[fixed]','Biomass LR* $\\sim$ s(MHW CInt*)','Biomass LR* $\\sim$ s(MHW CInt*) + Survey[random]')
 # all of the ugly code below pulls out the correct table values from the models and formats them nicely so they don't have too many digits printed 
 int.wt <- c(
   paste0(format(round(coef(summary(null.wt))[1,1], digits=3), nsmall=2)," ± ", format(round(coef(summary(null.wt))[1,2], digits=3), nsmall=2)),
@@ -276,8 +276,9 @@ p.wt <- c(
   format(round(summary(gam.re.wt)$s.pv[1], digits=3), nsmall=2)
 )
 
-r2.wt <- c("NA",
+r2.wt <- c(
            format(round(c(
+             summary(null.wt)$r.squared,
              summary(lm.wt)$r.squared, 
              summary(lm.fixed.wt)$r.squared,
              summary(gam.wt)$r.sq, 
@@ -357,7 +358,7 @@ lm.fixed.abs <- lm(wt_mt_log_scale_abs ~ anom_sev_scale + factor(survey), data =
 gam.abs <- gam(wt_mt_log_scale_abs ~ s(anom_sev_scale), data = modeldat )
 gam.re.abs <- gam(wt_mt_log_scale_abs ~ s(anom_sev_scale) + s(survey, bs="re"), data = modeldat  %>% mutate(survey = as.factor(survey)))
 
-formulas.abs <- c('Absolute Biomass LR* $\\sim$ 1','Absolute Biomass LR* $\\sim$ MHW CInt*','Absolute Biomass LR* $\\sim$ MHW CInt* + Survey[fixed]','Absolute Biomass LR* $\\sim$ s(MHW CInt*)','Absolute Biomass LR* $\\sim$ s(MHW CInt*) + s(Survey[random])')
+formulas.abs <- c('Absolute Biomass LR* $\\sim$ 1','Absolute Biomass LR* $\\sim$ MHW CInt*','Absolute Biomass LR* $\\sim$ MHW CInt* + Survey[fixed]','Absolute Biomass LR* $\\sim$ s(MHW CInt*)','Absolute Biomass LR* $\\sim$ s(MHW CInt*) + Survey[random]')
 # all of the ugly code below pulls out the correct table values from the models and formats them nicely so they don't have too many digits printed 
 int.abs <- c(
   paste0(format(round(coef(summary(null.abs))[1,1], digits=3), nsmall=2)," ± ", format(round(coef(summary(null.abs))[1,2], digits=3), nsmall=2)),
@@ -382,8 +383,9 @@ p.abs <- c(
   format(round(summary(gam.re.abs)$s.pv[1], digits=3), nsmall=2)
 )
 
-r2.abs <- c("NA",
+r2.abs <- c(
             format(round(c(
+              summary(null.abs)$r.squared,
               summary(lm.abs)$r.squared, 
               summary(lm.fixed.abs)$r.squared,
               summary(gam.abs)$r.sq, 
@@ -427,7 +429,7 @@ lm.fixed.altmhw <- lm(wt_mt_log_scale ~ anom_sev_scale_alt + factor(survey), dat
 gam.altmhw <- gam(wt_mt_log_scale ~ s(anom_sev_scale_alt), data = modeldat )
 gam.re.altmhw <- gam(wt_mt_log_scale ~ s(anom_sev_scale_alt) + s(survey, bs="re"), data = modeldat  %>% mutate(survey = as.factor(survey)))
 
-formulas.altmhw <- c('Biomass LR* $\\sim$ 1','Biomass LR* $\\sim$ MHW CInt**','Biomass LR* $\\sim$ MHW CInt** + Survey[fixed]','Biomass LR* $\\sim$ s(MHW CInt**)','Biomass LR* $\\sim$ s(MHW CInt**) + s(Survey[random])')
+formulas.altmhw <- c('Biomass LR* $\\sim$ 1','Biomass LR* $\\sim$ MHW CInt**','Biomass LR* $\\sim$ MHW CInt** + Survey[fixed]','Biomass LR* $\\sim$ s(MHW CInt**)','Biomass LR* $\\sim$ s(MHW CInt**) + Survey[random]')
 
 int.altmhw <- c(
   paste0(format(round(coef(summary(null.altmhw))[1,1], digits=3), nsmall=2)," ± ", format(round(coef(summary(null.altmhw))[1,2], digits=3), nsmall=2)),
@@ -451,7 +453,8 @@ p.altmhw <- c(
   format(round(summary(gam.altmhw)$s.pv, digits=3), nsmall=2),
   format(round(summary(gam.re.altmhw)$s.pv[1], digits=3), nsmall=2)
 )
-r2.altmhw <- c('NA',format(round(c(
+r2.altmhw <- c(format(round(c(
+    summary(null.altmhw)$r.squared, 
   summary(lm.altmhw)$r.squared, 
   summary(lm.fixed.altmhw)$r.squared,
   summary(gam.altmhw)$r.sq, 
@@ -550,7 +553,8 @@ coef.gompertz <- c('NA',paste0(format(round(gompertz.summary["anom_sev_scale","E
 
 p.gompertz <- c('NA',paste0(format(round(gompertz.summary["anom_sev_scale","Pr(>|z|)"] , digits=3), nsmall=2)))
 
-r2.gompertz <- c('NA',format(round(unname(performance::r2_xu(gompertz.glmm)), digits=3), nsmall=2)) # see performance() documentation -- xu is one of the simpler and more flexible R2 metrics
+r2.gompertz <- c(format(round(unname(performance::r2_xu(gompertz.null)), digits=3), nsmall=2),
+                 format(round(unname(performance::r2_xu(gompertz.glmm)), digits=3), nsmall=2)) # see performance() documentation -- xu is one of the simpler and more flexible R2 metrics
 
 aic.gompertz <- round(c(AIC(gompertz.null), AIC(gompertz.glmm)))
 
@@ -643,7 +647,10 @@ coef.depth <- c('NA',
 )
 p.depth <- c('NA',format(round(c(
   coef(summary(lm.depth))[2,4]), digits=3), nsmall=2))
-r2.depth <- c('NA',format(round(c(
+r2.depth <- c(
+  format(round(c(
+  summary(null.depth)$r.squared), digits=3), nsmall=2),
+  format(round(c(
   summary(lm.depth)$r.squared), digits=3), nsmall=2))
 aic.depth <- round(c(AIC(null.depth), AIC(lm.depth)))
 df.depth <- round(c(
@@ -730,7 +737,7 @@ gam.cti <- gam(cti_diff_scale ~ s(anom_sev_scale), data = modeldat )
 gam.re.cti <- gam(cti_diff_scale ~ s(anom_sev_scale) + s(survey, bs="re"), data = modeldat  %>% mutate(survey = as.factor(survey)))
 
 
-formulas.cti <- c('CTI Diff* $\\sim$ 1','CTI Diff* $\\sim$ MHW CInt*','CTI Diff* $\\sim$ MHW CInt* + Survey[fixed]','CTI Diff* $\\sim$ s(MHW CInt*)','CTI Diff* $\\sim$ s(MHW CInt*) + s(Survey[random])')
+formulas.cti <- c('CTI Diff* $\\sim$ 1','CTI Diff* $\\sim$ MHW CInt*','CTI Diff* $\\sim$ MHW CInt* + Survey[fixed]','CTI Diff* $\\sim$ s(MHW CInt*)','CTI Diff* $\\sim$ s(MHW CInt*) + Survey[random]')
 
 int.cti <- c(
   paste0(format(round(coef(summary(null.cti))[1,1], digits=3), nsmall=2)," ± ", format(round(coef(summary(null.cti))[1,2], digits=3), nsmall=2)),
@@ -753,8 +760,9 @@ p.cti <- c('NA',
            format(round(summary(gam.cti)$s.pv, digits=3), nsmall=2),
            format(round(summary(gam.re.cti)$s.pv[1], digits=3), nsmall=2)
 )
-r2.cti <- c('NA',format(round(c(
-  summary(lm.cti)$r.squared, 
+r2.cti <- c(format(round(c(
+    summary(null.cti)$r.squared, 
+summary(lm.cti)$r.squared, 
   summary(lm.fixed.cti)$r.squared,
   summary(gam.cti)$r.sq, 
   summary(gam.re.cti)$r.sq

diff --git a/MHW_stats_and_figures_supplement.pdf b/MHW_stats_and_figures_supplement.pdf
diff --git a/README.md b/README.md
@@ -2,11 +2,11 @@
 
 *Alexa L. Fredston, William W. L. Cheung, Thomas L. Frölicher, Zoë J. Kitchel, Aurore A. Maureaud, James T. Thorson, Arnaud Auber, Bastien Mérigot, Juliano Palacios-Abrantes, Maria Lourdes D. Palomares, Laurène Pecuchet, Nancy Shackell, Malin L. Pinsky*
 
-Please contact [A. Fredston](https://www.alexafredston.com/) with questions about this project. A DOI will be associated with this repository when it is published. Please cite the manuscript for all references to this project and its results, as well as the Zenodo DOI if data or code are reused. 
+Please contact [A. Fredston](https://www.alexafredston.com/) with questions about this project. Please cite the manuscript for all references to this project and its results, as well as the OSF DOI if data or code are reused. The results are fully reproducible from the scripts and all data files are either in this GitHub repository or (if they are too large) hosted on [OSF](https://osf.io/) (as described below). 
 
 ## Where do the raw data come from?
 
-We used of a number of datasets that are already publicly available and/or published with this project. These are fully described and cited in the manuscript, but we also list them here for ease of download, access, and attribution. The raw data files are also available on OSF. 
+We used of a number of datasets that are already publicly available and/or published with this project. These are fully described and cited in the manuscript, but we also list them here for ease of download, access, and attribution. The raw data files are also available on [OSF](https://osf.io/). 
 
 * Trawl data from [FISHGLOB](https://github.com/AquaAuma/FishGlob_data), a project to harmonize publicly available trawl survey records from federal agencies around the globe. 
 * Sea bottom temperature data from [GLORYS](https://www.mercator-ocean.eu/en/ocean-science/glorys/), an ocean reanalysis data product by the European Copernicus Marine Environment Modeling Service available beginning in 1993. 
@@ -15,27 +15,35 @@ We used of a number of datasets that are already publicly available and/or publi
 * Species-specific realized thermal niche estimates from [Burrows et al. 2018](https://figshare.com/articles/dataset/Species_Temperature_Index_and_thermal_range_information_forNorth_Pacific_and_North_Atlantic_plankton_and_bottom_trawl_species/6855203/1).
 * Species traits from [Beukhof et al. 2019](https://doi.org/10.5061/dryad.ttdz08kt8).
 
-## What's in this repository, and in what order should things be run?
+## What's in this repository?
 
 The repository is organized as follows:
 
-* `raw-data` contains raw datasets obtained from other sources (described above) that have not been changed in any way. If these are too large to host on GitHub they will be linked from here when published along with this manuscript. 
+* `raw-data` contains raw datasets obtained from other sources (described above) that have not been changed in any way. 
 * `processed-data` contains outputs of scripts that filter, clean, summarize, or analyze data.
 * `figures` contains image files generated by scripts for the main text figures.
 * `extended` contains image files generated by scripts for the extended data figures. 
 
+## What's not in the repository? 
+
+The following files are too big to host on GitHub and are available on [OSF](https://osf.io/) only:
+
+- The raw FISHGLOB data (a slightly earlier version of the dataset described in [Maureaud et al. 2023](https://doi.org/10.31219/osf.io/2bcjw)), `FISHGLOB_public_v1.5_clean(1).csv`. This should be downloaded from [OSF](https://osf.io/) and moved to `raw-data` to reproduce analyses.
+- The outputs of power analyses: `pwrout_yrs_glorys.rds`, `pwrout_yrs_oisst.rds`, `pwrout_gamma_glorys.rds`, and `pwrout_gamma_oisst.rds`. These should be downloaded from [OSF](https://osf.io/) and moved to `processed-data` to reproduce analyses.
+
+## In what order should things be run?
+
 To reproduce the analysis, run these scripts in order:
 
 1. `prep_survey_extent_nc.R` generates a netcdf file with the spatial footprints of trawl surveys used in the analysis, for merging with marine heatwave data. It also generates a .csv file with a "key" to translate the integer survey ID codes in the netcdf to their real names. This does not need to be re-run, but all the scripts below this should be re-run in order if the spatial footprints are updated.
-1. `prep_trawl_data.R` takes the public trawl data records from [FISHGLOB](https://github.com/AquaAuma/FishGlob_data), processes them into a standardized format with columns for survey, year, species, and CPUE in kg, and writes them out as .csv files in the `processed-data` folder. This was run on a computing server (see Notes below).
+1. `prep_trawl_data.R` takes the public trawl data records from version 1.5 of [FISHGLOB](https://github.com/AquaAuma/FishGlob_data) (see above note about downloading the data file from [OSF](https://osf.io/)), processes them into a standardized format with columns for survey, year, species, and CPUE in kg, and writes them out as .csv files in the `processed-data` folder. This was run on a computing server (see Notes below). If you skip this step, the rest of the code will still run, using processed data files hosted on GitHub. 
 1. `prep_MHWs.R` merges those trawl records with the appropriate MHW and CTI data.
 1. `temporal_beta_diversity.R` calculates interannual changes in richness and beta diversity (both total dissimilarity and individual components of nestedness and turnover for occurrence metrics and biomass gradient and balanced variation for biomass metrics).
-1. `power_analysis.R` conducts a power analysis. This was run on a computing server (see Notes below).
+1. `power_analysis.R` conducts a power analysis. This was run on a computing server (see Notes below) and if you download the output files from [OSF](https://osf.io/) (see above) you can skip this step.
 1. `MHW_stats_and_figures_main_text.R` analyzes the data and makes figures and statistics for the main text. 
 1. `MHW_stats_and_figures_supplement.Rmd` generates the SI tables as a PDF and writes out all the extended data figures. 
 
 ## Notes
 
-* The `raw-data` folder is not version controlled due to its size. It contains all of the data products that were used in this project. They are not technically raw data (as some pre-processing happened before they were pulled into this project), but not processed in this repository. The entire `raw-data` file is available on OSF. 
 * Software versions used in this analysis are captured by `renv` and listed [in the lockfile](https://github.com/afredston/marine_heatwaves_trawl/blob/main/renv.lock). Here is [a summary of how renv works](https://rstudio.github.io/renv/articles/renv.html). 
 * Most of these analyses were run on a personal computer, and should be easy to reproduce, except `prep_trawl_data.R` and `power_analysis.R`, which were conducted in R 3.6.0 on a machine with the following specifications: Windows Server 2019 2-Intel 6154 Xeon 3.0Ghz 18cores(36 threads) each 512 GB memory 6TB- SSD storage 18TB- HDD storage NVIDIA P4000 8Gb 10Gb ethernet