Prepare for v5 release, switch to furrr (#786)

* Bump version * Update DESCRIPTION * use furrr for build instead of remake; change dependencies accordingly
traitecoevo · Nov 16, 2023 · 915df00 · 915df00
1 parent 8fb8631
commit 915df00
Show file tree

Hide file tree

Showing 10 changed files with 90 additions and 7,416 deletions.
diff --git a/.github/workflows/check-build.yml b/.github/workflows/check-build.yml
@@ -34,13 +34,11 @@ jobs:
         run: |
           library(traits.build)
           source("R/custom_R_code.R")
-          dataset_test(dir("data"))
+          dataset_test(dir("data")[1])
         shell: Rscript {0}
 
       - name: build austraits
         run: |
-          library(traits.build)
-          source("R/custom_R_code.R")
-          remake::make()
+          source("build.R")
         shell: Rscript {0}
 
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 .remake
+remake.yml
 export
 .Rproj.user
 .DS_Store
@@ -14,7 +15,6 @@ temp
 .config/
 .vs/
 man/*
-*.Rproj
 tmp*
 reports
 data_*.csv

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -11,27 +11,20 @@ Authors@R: c(
     person("ARDC", role = c("fnd")),
     person("ARC", role = c("fnd"))
     )
-Description: This package enbales harmonising of data from diverse sources. The code was originally built to support AusTraits,  an open-source compilation of data on the traits of Australian plant species. For more information on AusTraits go to https://austraits.org.
+Description: This compendium compiles the AusTraits database, an open-source compilation of data on the traits of Australian plant species (see Falster et al 2021, <doi:10.1038/s41597-021-01006-6>). For more information on AusTraits go to https://austraits.org.
 BugReports: https://github.com/traitecoevo/austraits.build/issues
 URL: http://traitecoevo.github.io/austraits.build/
 License: BSD_2_clause + file LICENCE
 Depends:
     R (>= 4.2.0),
     base,
-    traits.build,
+    traits.build (>= 1.0.0),
     dplyr,
     lubridate,
-    stringr
-Imports:
-    crayon,
-    git2r,
-    remake
-Suggests:
-    austraits,
+    stringr,
+    furrr
 Remotes:
-    traitecoevo/traits.build@develop,
-    traitecoevo/austraits@develop,
-    richfitz/remake
+    traitecoevo/traits.build@v1.0.0
 Encoding: UTF-8
 VignetteBuilder: knitr
 RoxygenNote: 7.2.3

diff --git a/README.md b/README.md
@@ -41,7 +41,7 @@ To handle the harmonising of diverse data sources, we use a reproducible workflo
 
 ![](inst/figures/Workflow.png)
 
-We use the [`traits.build`](https://traitecoevo.github.io/traits.build/)  R package and workflow to harmonise  > 300 different sources into a unified dataset. The workflow is fully-reproducible and open, meaning it exposes the decisions made in the processing of data into a harmonised and curated dataset and can also be rerun by others. AusTraits is built so that the database can be rebuilt from its parts at any time. This means that decisions made along the way (in how data is transformed or encoded) can be inspected and modified, and new data can be easily incorporated.
+We use the [`traits.build`](https://traitecoevo.github.io/traits.build/)  R package and workflow to harmonise > 300 different sources into a unified dataset. The workflow is fully-reproducible and open, meaning it exposes the decisions made in the processing of data into a harmonised and curated dataset and can also be rerun by others. AusTraits is built so that the database can be rebuilt from its parts at any time. This means that decisions made along the way (in how data is transformed or encoded) can be inspected and modified, and new data can be easily incorporated.
 
 To build the database, follow these steps:
 
@@ -54,23 +54,24 @@ remotes::install_github("traitecoevo/traits.build", quick = TRUE)
 ```
 ***Clone repository***
 
-Next you need to download a copy of this repository from Github. Then open the Rstudio project, or open R into the right repo directory.
+Next, download a copy of this repository from GitHub. Then open the RStudio project, or start R in the repository directory.
 
-***Compile via `remake`***
+***Build***
 
-One of the packages that will be installed with the `traits.build` is [`remake`](https://github.com/richfitz/remake). This package manages the compiling, and also helps streamline the amount of recompiling needed when new sources are added.
+Building the database should then be as easy as running the code in the file `build.R`. Note that this code can use multiple CPUs; to do so, change the number of workers in `build.R` to a value greater than 1.
 
-Running the following command will rebuild AusTraits and save the assembled database into an RDS file located in `export/data/curr/austraits.rds`.
-
-```{r, eval=FALSE, echo=TRUE}
-remake::make()
-austraits <- readRDS("export/data/curr/austraits.rds")
+```
+source("build.R")
 ```
 
-Remake can also load the compiled dataset directly into R by calling:
+After running, you should have an object `austraits` available in your workspace, as well as a copy saved in `export/data/curr/austraits.rds`.
 
-```{r, eval=FALSE, echo=TRUE}
-austraits <- remake::make("austraits")
+## Updating the build script
+
+To update the build script (`build.R`), run:
+
+```
+traits.build::build_setup_pipeline(method="furrr", database_name = "austraits", workers = 1)
 ```
 
 ## Contributing to AusTraits

diff --git a/austraits.build.Rproj b/austraits.build.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/build.R b/build.R
@@ -0,0 +1,54 @@
+# This file is automatically generated from traits.build
+# package, via the file remake.yml.whisker:
+# edit the file there (or the files that it includes).
+
+# Load R resources
+library(traits.build)
+library(furrr)
+
+# Load data resources
+# These objects are shared configuration: f() below reads them as globals
+# and forwards each one to dataset_build() for every dataset.
+schema <- get_schema()
+resource_metadata <- get_schema("config/metadata.yml", "metadata")
+definitions <- get_schema("config/traits.yml", "traits")
+unit_conversions <- get_unit_conversions("config/unit_conversions.csv")
+# NOTE(review): read_csv_char presumably reads every column as character --
+# confirm against traits.build.
+taxon_list <- read_csv_char("config/taxon_list.csv")
+
+# Build sources
+dataset_ids <- c('ABRS_1981', 'ABRS_2022', 'ABRS_2023', 'Ahrens_2019', 'ANBG_2019', 'Angevin_2011', 'Apgaua_2015', 'Apgaua_2017', 'Atkinson_2020', 'Atkinson_2020_2', 'ATRP_2020', 'Auld_2000', 'Baker_2019', 'Bean_1997', 'Blackman_2010', 'Bloomfield_2018', 'Bradshaw_2022', 'Bragg_2002', 'BRAIN_2007', 'Briggs_2010', 'Britton_1994', 'Brock_1993', 'Brodribb_2000', 'Buckton_2019', 'Burrows_2001', 'Burrows_2008', 'Burrows_2020', 'Caldwell_2016', 'Campbell_2006', 'Canham_2009', 'Canham_2023', 'Catford_2014', 'Cernusak_2006', 'Cernusak_2011', 'Chandler_2002', 'Cheal_2017', 'Cheesman_2020', 'Chen_2017', 'Chinnock_2007', 'Choat_2006', 'Choat_2012', 'Clarke_2009', 'Clarke_2015', 'Collette_2021', 'Cooper_2004', 'Cooper_2013', 'Cowling_1987', 'CPBR_2002', 'Craven_1987', 'Craven_2010', 'Crisp_2017', 'Cross_2009', 'Crous_2013', 'Crous_2019', 'Crowley_2007', 'Cunningham_1999', 'Curran_2009', 'Curtis_2012', 'deCampos_2013', 'Denton_2007', 'Detombeur_2021', 'Dong_2017', 'Draper_2023', 'Du_2018', 'Du_2019', 'Duan_2015', 'Duncan_1998', 'Dwyer_2017', 'Dwyer_2018', 'Eamus_1998', 'Eamus_1999', 'Eamus_2000', 'Edwards_2000', 'eFLOWER_2021', 'eFLOWER_Dun_2022', 'Ellsworth_2015', 'Enright_2014', 'EsperonRodriguez_2019', 'EsperonRodriguez_2020', 'Everingham_2020', 'Falster_2003', 'Falster_2005_1', 'Falster_2005_2', 'Farrell_2012', 'Farrell_2013', 'Farrell_2017', 'Firn_2019', 'Fonseca_2000', 'Forster_1992', 'Forster_1995', 'French_2017', 'Funk_2016', 'Gallagher_2011_1', 'Gallagher_2011_3', 'Gallagher_2012', 'Gallagher_2015', 'Gallagher_2018', 'Gardiner_2019', 'Geange_2017', 'Geange_2020', 'Ghannoum_2010', 'Goble_1981', 'Gosper_2004', 'Gosper_2012', 'Gosper_2018', 'Gosper_2022', 'GrassBase_2014', 'Gray_2019', 'Grigg_2008', 'Groom_1997', 'Groom_2010', 'Grootemaat_2015', 'Grootemaat_2017_1', 'Grootemaat_2017_2', 'Gross_1993', 'Gross_2005', 'Groves_1986', 'Grubb_1996', 'Grubb_2008', 'Guilherme_Pereira_2018', 'Guilherme_Pereira_2019', 'Hall_1981', 'Harrison_2009', 'Harvey_2017', 'Hassiotou_2009', 
'Hayes_2014', 'Hayes_2018', 'He_2011', 'Henery_2001', 'Hocking_1982', 'Hocking_1986', 'Huang_2015', 'Hughes_1992', 'Hughes_2005', 'Hyland_2003', 'Ilic_2000', 'Islam_1999_1', 'Islam_1999_2', 'Jagdish_2020', 'Jin_2019', 'Jordan_2001', 'Jordan_2007', 'Jordan_2015', 'Jordan_2020', 'Jurado_1991', 'Jurado_1992', 'Kanowski_2000', 'Keighery_2004', 'Kew_2019_1', 'Kew_2019_2', 'Kew_2019_3', 'Kew_2019_4', 'Kew_2019_5', 'Kew_2019_6', 'Kirkpatrick_2020', 'Knox_2011', 'Kocacinar_2003', 'Kooyman_2011', 'Kotowska_2020', 'Kubiak_2009', 'Kuo_1982', 'Laliberte_2012', 'Lamont_2002', 'Lawes_2012', 'Lawes_2014', 'Lawson_2015', 'Laxton_2005', 'Lee_2019', 'Leigh_2003', 'Leigh_2006', 'Leishman_1992', 'Leishman_1993', 'Leishman_1995', 'Leishman_2007', 'Lemmens_1994', 'Lewis_2015', 'Lim_2017', 'Lord_1997', 'Lunt_2012', 'Lusk_2010', 'Lusk_2012', 'Lusk_2014', 'MacinnisNg_2004', 'MacinnisNg_2016', 'Manea_2011', 'Maslin_2012', 'McCarthy_2017', 'McGlone_2015', 'Meers_2007', 'Mesaglio_2022', 'Metcalfe_2009', 'Metcalfe_2020_1', 'Metcalfe_2020_2', 'Milberg_1997', 'Milberg_1998', 'Mitchell_2008', 'Mokany_2008', 'Mokany_2015', 'Moles_2000', 'Moles_2003', 'Moles_2011', 'Moore_2019', 'Moore_2019_2', 'Morgan_2005', 'Morgan_2011_1', 'Morgan_2011_2', 'Morgan_2014', 'Morgan_2021', 'Muir_2014', 'Munroe_2019', 'Nano_2011', 'NHNSW_2014', 'NHNSW_2014_2', 'NHNSW_2016', 'NHNSW_2022', 'NHNSW_2023', 'Nicholson_2017', 'Nicolle_2006', 'Niinemets_2009', 'Nolan_2022', 'NSWFRD_2014', 'NTH_2014', 'NTH_2022', 'NTH_2023', 'Onoda_2010', 'Ooi_2007', 'Ooi_2018', 'OReillyNugent_2018', 'Osborne_2014', 'Pate_1990', 'Pate_1998', 'Peeters_2002', 'Pekin_2011', 'Pfautsch_2016', 'Pickering_2014', 'Pickup_2002', 'Pickup_2005', 'Pirralho_2014', 'Pollock_2012', 'Pollock_2018', 'Prior_2003', 'Prior_2016', 'Prior_2022', 'Purdie_1976', 'RBGK_2014', 'RBGV_2022', 'RBGV_2023', 'Read_2003', 'Read_2005', 'Reynolds_2018', 'Rice_1991', 'Richards_2003', 'Richards_2008', 'Richards_2021', 'Roberts_2006', 'Roderick_1999', 'Roderick_2002', 
'Rosell_2014', 'Rumman_2018', 'RussellSmith_2012', 'Rye_2002', 'Rye_2006', 'Rye_2009_1', 'Rye_2009_2', 'Rye_2013_1', 'Rye_2013_2', 'Rye_2015', 'SAH_2014', 'SAH_2022', 'SAH_2023', 'Sams_2017', 'Santini_2012', 'Santini_2013', 'Santini_2016', 'Schmidt_1993', 'Schmidt_1997', 'Schmidt_2003', 'Schmidt_2010', 'Schulze_1998', 'Schulze_2006', 'Schulze_2006_2', 'Schulze_2014', 'Scott_2010', 'Searson_2004', 'Sendall_2016', 'Simpson_2021', 'SinghRamesh_2019', 'SinghRamesh_2023', 'Sjostrom_2006', 'Smith_1996', 'Smith_2012', 'SmithMartin_2020', 'Soliveres_2012', 'Soper_2014', 'Standish_2019', 'Staples_2019', 'Stephens_2021', 'Stephens_2023', 'Stewart_1995', 'Sweedman_2006', 'Taseski_2017', 'Taylor_2008', 'Thomas_2017', 'Thompson_2001', 'TMAG_2009', 'Tng_2013', 'Toelken_1996', 'Togashi_2015', 'Tolsma_2007', 'Tomlinson_2013', 'Tomlinson_2019', 'Trudgen_2005', 'Trudgen_2014', 'Tsakalos_2020', 'Tsakalos_2022', 'Turner_2010', 'vanderMoezel_1987', 'Veneklaas_2003', 'Venn_2011', 'Vesk_2004', 'Vesk_2007', 'Vesk_2019', 'Vlasveld_2018', 'WAH_1998', 'WAH_2016', 'WAH_2022_1', 'WAH_2022_2', 'WAH_2023_1', 'WAH_2023_2', 'Warren_2005', 'Warren_2006', 'Weerasinghe_2014', 'Wells_2012', 'Wenk_2022', 'Wenk_2023', 'Wenk_2023_2', 'Westman_1977', 'Westoby_2003', 'Westoby_2004', 'Westoby_2014', 'Wheeler_2002', 'White_2020', 'Williams_2005', 'Williams_2011', 'Williams_2012', 'Wills_2018', 'Wilson_2004', 'Wilson_2008', 'Witkowski_1991', 'Wooller_2002', 'Wright_2000', 'Wright_2001', 'Wright_2002', 'Wright_2006', 'Wright_2008', 'Wright_2009', 'Wright_2019', 'Yang_2023', 'Yunusa_2010', 'Zanne_2007', 'Zanne_2009', 'Zieminska_2013', 'Zieminska_2015')
+
+# Build a single dataset from its folder under `data/`.
+#
+# `dataset_id` is the name of the folder; the `metadata.yml` and `data.csv`
+# files inside it are handed to dataset_build() together with the shared
+# configuration objects loaded earlier in this script. Returns whatever
+# dataset_build() returns.
+f <- function(dataset_id) {
+  dataset_dir <- file.path("data", dataset_id)
+
+  dataset_build(
+    file.path(dataset_dir, "metadata.yml"),
+    file.path(dataset_dir, "data.csv"),
+    definitions = definitions,
+    unit_conversion_functions = unit_conversions,
+    schema = schema,
+    resource_metadata = resource_metadata,
+    taxon_list = taxon_list
+  )
+}
+
+# Serial alternative that does not use futures at all:
+#sources <- purrr::map(dataset_ids, f)
+
+# Choose how the futures are executed. Raise `workers` above 1 to build
+# datasets on multiple CPUs.
+plan(multisession, workers = 1)
+# Alternative plan, kept for reference:
+#plan(sequential)
+
+# Apply f() to every dataset id (one result per id), reporting progress.
+sources <- future_map(dataset_ids, f, .progress = TRUE)
+
+# Combine the per-dataset results via build_combine() from traits.build.
+austraits_raw <- build_combine(d = sources)
+
+# Version information: the version number is read from the metadata config;
+# util_get_SHA() presumably returns the current git commit -- confirm against
+# traits.build.
+version_number <- util_get_version("config/metadata.yml")
+git_SHA <- util_get_SHA()
+
+# Attach the version information to the combined database
+austraits <- build_add_version(austraits_raw, version_number, git_SHA)
+
+# Save to file. The output directory only needs creating once; parents are
+# created as required and no warning is raised if it already exists.
+dir.create("export/data/curr", showWarnings = FALSE, recursive = TRUE)
+saveRDS(austraits, "export/data/curr/austraits.rds")
+
+# Save a second copy whose file name includes the version number
+saveRDS(austraits, sprintf("export/data/curr/austraits-%s.rds", version_number))
+
diff --git a/config/metadata.yml b/config/metadata.yml
@@ -1,7 +1,7 @@
 metadata:
   title: 'AusTraits: a curated plant trait database for the Australian flora'
   description: "AusTraits is a transformative database, containing measurements on the traits of Australia's plant taxa, standardised from hundreds of disconnected primary sources. While this repository contains the harmonised data, the raw data and code used to build the resource are also available on the project's GitHub repository,http://traitecoevo.github.io/austraits.build. Further information on the project is available in the associated publication and at the project website https://austraits.org."
-  version: "4.2.0.9000"
+  version: "5.0.0"
   doi: 10.5281/zenodo.3568417
   geo_location:
     geo_location_place: Australia

diff --git a/inst/support/remake.yml.whisker b/inst/support/remake.yml.whisker