From 09d4e55364de2fdec4cb1c501c6645c5b7e2161e Mon Sep 17 00:00:00 2001 From: Fonti Kar Date: Tue, 22 Oct 2024 08:49:37 +1100 Subject: [PATCH] Workflow of adapting infinitylists for Global use via GBIF Global node (#100) * Updated file naming * Removed Australia prefix in parquet names * Started skeleton of vignette * Added spiel about finding matching fields * Comments about Establishment means, outlining spatial part of the workflow and discussion about filtering in process_data() * Minor edits and seeing through establishmeans * Edits * Code to translate for gbif_global * Processing GBIF code * Sorting out date myster * Added gbif process code * Commented out downloads * Moved gbif code into its own script * Fixed mapping for large downloads * Added skeleton documentation * Fixed naming of Establishment Means and saving Link in parquet for GBIF * Added wrapper to download gbif obs but still buggy with dates * Added documentation and suprressed warnings * Updated documentation * Updated doc * Updated Global vars, updated vignette worfklow and tests * Updated README * Removed name bits that were not working #85 --- .Rbuildignore | 6 +- .gitignore | 3 + DESCRIPTION | 9 +- NAMESPACE | 2 + R/galah_download.R | 9 +- R/gbif_download.R | 209 ++++++++++++++++++++++++ R/infinitylists-package.R | 11 +- R/server.R | 12 +- R/ui.R | 3 +- README.Rmd | 7 +- README.md | 10 ++ man/download_gbif_obs.Rd | 34 ++++ man/query_gbif_global.Rd | 20 +++ tests/testthat/_snaps/galah_download.md | 2 +- vignettes/.gitignore | 2 + vignettes/diy.Rmd | 138 ++++++++++++++++ vignettes/diy.Rmd.orig | 130 +++++++++++++++ 17 files changed, 586 insertions(+), 21 deletions(-) create mode 100644 R/gbif_download.R create mode 100644 man/download_gbif_obs.Rd create mode 100644 man/query_gbif_global.Rd create mode 100644 vignettes/.gitignore create mode 100644 vignettes/diy.Rmd create mode 100644 vignettes/diy.Rmd.orig diff --git a/.Rbuildignore b/.Rbuildignore index da446b9..3d4f842 100644 --- 
a/.Rbuildignore +++ b/.Rbuildignore @@ -8,7 +8,11 @@ ^\.github$ ^codecov\.yml$ ^inst/data$ -ignore/ +^ignore$ + + +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index a66dfe0..6160563 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ .DS_Store ignore/ inst/data +inst/doc +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index 5cf7894..d0aa683 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,11 +32,16 @@ Imports: shiny, shinybusy, stringr, - shinythemes + shinythemes, + tidyr Remotes: traitecoevo/APCalign Suggests: - job, bsplus, + here, + job, + knitr, + rmarkdown, testthat (>= 3.0.0) Config/testthat/edition: 3 +VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 448252b..b688c06 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,8 @@ # Generated by roxygen2: do not edit by hand export(download_ala_obs) +export(download_gbif_obs) export(infinitylistApp) +export(query_gbif_global) import(data.table) import(shiny) diff --git a/R/galah_download.R b/R/galah_download.R index a2d78d3..149c82e 100644 --- a/R/galah_download.R +++ b/R/galah_download.R @@ -114,7 +114,7 @@ retrieve_data_by_years <- function(taxon, download, paste0( output_dir, - "ALA-Australia-", + "Living-Atlas", taxon, "-", first(years), @@ -189,12 +189,12 @@ get_establishment_status <- function(ala_cleaned, taxon = taxon) { ala_cleaned$native_anywhere_in_aus <- "unknown" } # Rename native_anywhere_in_aus - ala_cleaned <- dplyr::rename(ala_cleaned,"Establishment means" = native_anywhere_in_aus) + ala_cleaned <- dplyr::rename(ala_cleaned,"Establishment Means" = native_anywhere_in_aus) return(ala_cleaned) } - +#' Process downloaded data from Atlas of Living Australia #' @noRd process_data <- function(data) { datasets_of_interest <- c( @@ -244,7 +244,6 @@ process_data <- function(data) { janitor::clean_names("title") } - #' @noRd save_data <- function(data, taxon, output_dir) { if (!file.exists(file.path(output_dir))) { @@ -255,7 +254,7 @@ save_data <- function(data, taxon, 
output_dir) { arrow::write_parquet(x = data, sink = file.path( output_dir, - paste0("Australia-", + paste0("Living-Atlas-", taxon, "-", Sys.Date(), diff --git a/R/gbif_download.R b/R/gbif_download.R new file mode 100644 index 0000000..a90a8c2 --- /dev/null +++ b/R/gbif_download.R @@ -0,0 +1,209 @@ +#' Download and Process GBIF Observations +#' +#' This function retrieves, processes, and saves GBIF (Global Biodiversity Information Facility) observation data for a specified taxon. +#' +#' @param taxon Character. The taxon (species, genus, etc.) for which to retrieve GBIF data. +#' @param min_year Numeric. The minimum year for the observations to be retrieved. Default is 1923. +#' @param max_year Numeric. The maximum year for the observations to be retrieved. Default is the current year. +#' @param country_code Character. The ISO 3166-1 alpha-2 country code to filter observations by country. Default is NULL (no country filter). +#' @param save_raw_data Logical. Whether to save the raw data retrieved from GBIF. Default is FALSE. +#' @param output_dir Character. The directory where the processed data will be saved. Default is a "data" directory within the "infinitylists" package. +#' +#' @return None. The function saves the processed data to the specified output directory. +#' @export + +download_gbif_obs <- function(taxon, + min_year = 1923, + max_year = as.numeric(format(Sys.Date(), "%Y")), + country_code= NULL, + save_raw_data = FALSE, + output_dir = file.path(system.file(package = "infinitylists"), "data/")){ + # 1. Data retrieval + gbif_obs <- + retrieve_gbif_data(taxon, min_year, max_year, country_code, save_raw_data, output_dir) + + # 2. Filtering and processing + gbif_cleaned <- suppressWarnings(gbif_process_data(gbif_obs)) + + # 4. 
Save processed data + save_data(gbif_cleaned, taxon, output_dir) +} + + +#' Generate galah query for Global GBIF Node +#' +#' @param taxon character, genus/family/kingdom +#' @param min_year numeric, year cut off for query, only records where year >= min_year will be included +#' @param max_year numeric, year cut off for query, only records where year <= max_year will be included +#' @param country_code character, code for country +#' @export + +query_gbif_global<- function(taxon, + min_year, + max_year, + country_code = NULL){ + if(is.null(country_code)) + query <- galah::galah_call() |> + galah::galah_identify(taxon) |> + galah::galah_filter( + hasGeospatialIssue == "false", + year >= min_year, + year <= max_year) + else( + query <- galah::galah_call() |> + galah::galah_identify(taxon) |> + galah::galah_filter( + country == country_code, + hasGeospatialIssue == "false", + year >= min_year, + year <= max_year) + ) + return(query) +} + +#' Download GBIF data and save as output +#' @noRd +#' @keywords internal +retrieve_gbif_data_by_year_range <- function(taxon, min_year, max_year, country_code= NULL, + save_raw_data = FALSE, output_dir){ + + download <- query_gbif_global(taxon, min_year, max_year, country_code) |> + galah::atlas_occurrences() + + # Save download (optional) + if (save_raw_data) + arrow::write_parquet( + download, + paste0( + output_dir, + "GBIF-preprocessed-", + taxon, + "-", + min_year, + "-", + max_year, + "-", + Sys.Date(), + ".parquet" + ) + ) + + return(format_date_as_character(download)) +} + +#' @noRd +#' @keywords internal +format_date_as_character <- function(data){ + data |> + dplyr::mutate(eventDate = as.character(eventDate)) +} + + +#' Retrieve GBIF records +#' +#' @param taxon +#' @param min_year +#' @param max_year +#' @param country_code +#' @param save_raw_data +#' @param output_dir +#' @noRd +#' @keywords internal + +retrieve_gbif_data <- function(taxon, min_year, max_year, + country_code = NULL, + save_raw_data = FALSE, + 
output_dir = file.path(system.file(package = "infinitylists"), "data/") + ){ + + n_obs <- query_gbif_global(taxon, min_year, max_year, country_code) |> + galah::atlas_counts() |> + dplyr::pull(count) + + # If less than 1 mil records + if (n_obs < 1000000){ + download <- retrieve_gbif_data_by_year_range(taxon, + min_year, max_year, + country_code, + save_raw_data, + # output_dir = file.path(system.file(package = "infinitylists"), "data/") + output_dir) + } else { + + years <- seq(min_year, max_year) + length(years) + + # Split years + year_chunks <- split(years, ceiling(seq_along(years) / 10)) + + # Map this + download <- purrr::map(year_chunks, + purrr::possibly( + ~ retrieve_gbif_data_by_year_range(taxon, + min_year = range(.x)[1], max_year = range(.x)[2], + country_code, + save_raw_data, + # output_dir = file.path(system.file(package = "infinitylists"), "data/") + output_dir)) + ) |> + purrr::list_rbind() + + } + return(download) +} + +#' Process downloaded data from GBIF +#' @noRd +#' @keywords internal + +gbif_process_data <- function(data){ + data |> + tidyr::drop_na(decimalLatitude) |> + dplyr::filter( + basisOfRecord == "PRESERVED_SPECIMEN" | + stringr::str_detect(institutionCode, stringr::regex("inaturalist", ignore_case = TRUE)), + is.na(coordinateUncertaintyInMeters) | + coordinateUncertaintyInMeters <= 1000, + !is.na(eventDate), + !stringr::str_detect(species, "spec.$"), + !stringr::str_count(eventDate) <= 7, # Exclude strings with 7 or fewer characters, these are years or year + month e.g 2006-06 or just 2006 + !stringr::str_count(eventDate) > 16 + ) |> # Exclude strings with greater than 16 characters - a few records had date ranges e.g. 
2017-12-10T00:00Z/2017-12-23T00:00Z + dplyr::mutate( + eventDate_as_date = lubridate::as_date(eventDate), # Convert to dates + eventDate_ymd = lubridate::ymd_hm(eventDate, tz = "UTC", quiet = TRUE), # Convert dates that have time zones + collectionDate = dplyr::coalesce(eventDate_as_date, eventDate_ymd) # Put the two date columns together as one complete one. + ) |> + dplyr::mutate( + repository = dplyr::case_when(grepl("inatur", occurrenceID) ~ occurrenceID, # Create Repository column, if occurrence ID contains "inatur", keep occurrenceID + TRUE ~ institutionCode), # Otherwise take institutionCode + link = dplyr::case_when(grepl("https", repository) ~ repository, # Create link + TRUE ~ paste0("https://www.gbif.org/dataset/", datasetKey) + ), + sounds = dplyr::case_when( # Logical variable to determine if there voucher_type + grepl("Sound", mediaType) ~ 1, + TRUE ~ 0 + ), + voucher_type = dplyr::case_when( + basisOfRecord == "PRESERVED_SPECIMEN" ~ "Collection", + sounds == 1 ~ "Audio", + TRUE ~ "Photograph" + ), + lat = decimalLatitude, + long = decimalLongitude, + ) |> + dplyr::select( + species, genus, family, + collectionDate, + lat, + long, + voucher_type, + repository, + recordedBy, + establishmentMeans, + link + ) |> + janitor::clean_names("title") +} + + diff --git a/R/infinitylists-package.R b/R/infinitylists-package.R index 4f579b0..d03c503 100644 --- a/R/infinitylists-package.R +++ b/R/infinitylists-package.R @@ -60,7 +60,14 @@ utils::globalVariables( "write.csv", "Link", "Repository", - "Establishment means", - "repository" + "Establishment Means", + "repository", + "eventDate_as_date", + "eventDate_ymd", + "establishmentMeans", + "hasGeospatialIssue", + "link", + "country", + "count" ) ) \ No newline at end of file diff --git a/R/server.R b/R/server.R index d9bcd32..e97f8ef 100644 --- a/R/server.R +++ b/R/server.R @@ -200,7 +200,7 @@ infinity_server <- function(...) 
{ total_family <- length(unique(data$Family)) native <- - dplyr::filter(data, `Establishment means` == "native") + dplyr::filter(data, `Establishment Means` == "native") if (nrow(native) > 0) total_native_species <- length(unique(native$Species)) else @@ -235,7 +235,7 @@ infinity_server <- function(...) { total_species, " species observed, ", total_native_species, - " are considered native to Australia." + " are considered native" ) }) @@ -336,7 +336,7 @@ infinity_server <- function(...) { `Recorded by` = `Recorded by`[1] ), by = .(Species, - `Establishment means`, + `Establishment Means`, `Voucher type` = `Voucher Type`)] @@ -392,10 +392,8 @@ infinity_server <- function(...) { output$downloadData <- downloadHandler( filename = function() { name_bits <- gsub(".parquet", "", input$ala_path) - name_bits <- gsub("Australia-", "", name_bits) - paste(input$place, - "-", - input$buffer_size, + name_bits <- gsub("infinitylists-", "", name_bits) + paste(input$buffer_size, "m-buffer-", name_bits, ".csv", diff --git a/R/ui.R b/R/ui.R index 2b5f549..a49412d 100644 --- a/R/ui.R +++ b/R/ui.R @@ -8,8 +8,7 @@ ui <- function(){ files_in_directory <- list.files(path = system.file(package = "infinitylists", "data/"), pattern = ".parquet") - taxa_names <- - gsub("Australia-(.+?)-[0-9]{4}-[0-9]{2}-[0-9]{2}.parquet", + taxa_names <- gsub("-[0-9]{4}-[0-9]{2}-[0-9]{2}.parquet", "\\1", files_in_directory) diff --git a/README.Rmd b/README.Rmd index 21e7e72..b2cfb9d 100644 --- a/README.Rmd +++ b/README.Rmd @@ -64,10 +64,15 @@ galah::galah_config(email = "YOUR EMAIL HERE") download_ala_obs(taxon = "Orthoptera") infinitylistApp() +``` +## Adapt infinitylist for other countries -``` +We have developed functions to assist users to create their own infinitylist for their chosen taxa and country. Check out the vignette which shows you how to do so! +```{r} +vignette("diy") +``` ## Why did I get disconnected from the server? 
diff --git a/README.md b/README.md index 25825d8..d27135e 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,16 @@ download_ala_obs(taxon = "Orthoptera") infinitylistApp() ``` +## Adapt infinitylist for other countries + +We have developed functions to assist users to create their own +infinitylist for their chosen taxa and country. Check out the vignette +which shows you how to do so! + +``` r +vignette("diy") +``` + ## Why did I get disconnected from the server? If `infinitylists` is left open but idle in your browser for too long, diff --git a/man/download_gbif_obs.Rd b/man/download_gbif_obs.Rd new file mode 100644 index 0000000..27583cf --- /dev/null +++ b/man/download_gbif_obs.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gbif_download.R +\name{download_gbif_obs} +\alias{download_gbif_obs} +\title{Download and Process GBIF Observations} +\usage{ +download_gbif_obs( + taxon, + min_year = 1923, + max_year = as.numeric(format(Sys.Date(), "\%Y")), + country_code = NULL, + save_raw_data = FALSE, + output_dir = file.path(system.file(package = "infinitylists"), "data/") +) +} +\arguments{ +\item{taxon}{Character. The taxon (species, genus, etc.) for which to retrieve GBIF data.} + +\item{min_year}{Numeric. The minimum year for the observations to be retrieved. Default is 1923.} + +\item{max_year}{Numeric. The maximum year for the observations to be retrieved. Default is the current year.} + +\item{country_code}{Character. The ISO 3166-1 alpha-2 country code to filter observations by country. Default is NULL (no country filter).} + +\item{save_raw_data}{Logical. Whether to save the raw data retrieved from GBIF. Default is FALSE.} + +\item{output_dir}{Character. The directory where the processed data will be saved. Default is a "data" directory within the "infinitylists" package.} +} +\value{ +None. The function saves the processed data to the specified output directory. 
+} +\description{ +This function retrieves, processes, and saves GBIF (Global Biodiversity Information Facility) observation data for a specified taxon. +} diff --git a/man/query_gbif_global.Rd b/man/query_gbif_global.Rd new file mode 100644 index 0000000..b678125 --- /dev/null +++ b/man/query_gbif_global.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gbif_download.R +\name{query_gbif_global} +\alias{query_gbif_global} +\title{Generate galah query for Global GBIF Node} +\usage{ +query_gbif_global(taxon, min_year, max_year, country_code = NULL) +} +\arguments{ +\item{taxon}{character, genus/family/kingdom} + +\item{min_year}{numeric, year cut off for query, only records where year >= min_year will be included} + +\item{max_year}{numeric, year cut off for query, only records where year <= max_year will be included} + +\item{country_code}{character, code for country} +} +\description{ +Generate galah query for Global GBIF Node +} diff --git a/tests/testthat/_snaps/galah_download.md b/tests/testthat/_snaps/galah_download.md index b23c5c4..9631413 100644 --- a/tests/testthat/_snaps/galah_download.md +++ b/tests/testthat/_snaps/galah_download.md @@ -14,5 +14,5 @@ 6 Hemico~ Hemi~ Cordu~ 1924-10-10 00:00:00 -28.7 152. Collection QM 7 Synthe~ Synt~ Synth~ 1924-01-01 00:00:00 -41.9 145. 
Collection QM # i 4 more variables: `Recorded by` , `Record Id` , Link , - # `Establishment means` + # `Establishment Means` diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/diy.Rmd b/vignettes/diy.Rmd new file mode 100644 index 0000000..7ce933b --- /dev/null +++ b/vignettes/diy.Rmd @@ -0,0 +1,138 @@ +--- +title: "Create your own infinitylists" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{DIY infinitylists} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +One major benefit of infinitylists using a Living Atlas node is that it gives users the ability to create their own version of infinitylists for whichever Living Atlas you would like to use. Unfortunately, there are some slight inconsistencies in data coverage and naming between Living Atlas data providers. This makes creating your own infinitylists not an entirely straightforward process but I hope this article will be able to give you some guidance. + +Here, I will walk through the process on how to adapt the source code of infinitylists so you can create your own version of infinitylists for any country and taxa of your choice. + +If you have any questions about this process, please do not hesitate to reach out by submitting an issue at the [infinitylists repository](https://github.com/traitecoevo/infinitylists). + +### Load dependencies + +We are going to need a few packages to create your own infinitylists. Go ahead and install these if you don't have these in your version of R. Otherwise, load them and we can get started. + + +``` r +# install.packages("devtools") +devtools::install_github("traitecoevo/infinitylists") +library(infinitylists) +library(galah) +library(arrow) +library(tidyverse) +``` + +You will need to register for a [GBIF account](https://www.gbif.org/). 
Click on the "Login" button on the top left corner and click on the "Register" tab. Note down your login credentials for safe-keeping once you have verified your account and created a password. + +### Configure galah + +We will be using [{galah}](https://galah.ala.org.au/R/) to download occurrence records used in our infinitylist. To do so, we need to configure the settings so the package knows to point to the Global GBIF API. + +Here I've saved the credentials in my R environment so they are not shared publicly. I can call on these environment variables using `Sys.getenv()`. You can set them with `usethis::edit_r_environ()`. + + +``` r +# Set atlas +galah_config( + atlas = "Global", + username = Sys.getenv("GBIF_USERNAME"), + password = Sys.getenv("GBIF_PWD"), + email = Sys.getenv("GBIF_EMAIL") +) + +``` + +### Submit data request + +Once we have all that set up, we can request data from GBIF Global. Here I am downloading records for the wall lizard genus 'Podarcis', from years 2000 to 2024. Under `country_code`, I've specified `"FR"` for records found in France. Here is a [list of codes](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) for each country. The `download_gbif_obs` function will download the records and save them inside the infinitylist R package so you can use them immediately. + +Note that depending on how many records are requested, the download will take some time. + + +``` r +download_gbif_obs("Podarcis", + min_year = 2000, + max_year = 2024, + country_code = "FR") +``` + +### Pre-download check + +You can check roughly how big your download is by using the `query_gbif_global()` function with `galah::atlas_counts()`. Note this will not be the final number of records that go into infinitylist as we do further exclusions and data cleaning behind the scenes. 
+ + +``` r +query_gbif_global("Podarcis", + min_year = 2000, + max_year = 2024, + country_code = "FR") |> + galah::atlas_counts() +``` + +You can investigate the full download by specifying `save_raw_data = TRUE` in `download_gbif_obs()`. + +### Launch infinitylist and explore! + +Once the download is complete, you are all set! Launch infinitylist and you will find your download under the dropdown menu "taxa". + + +``` r +infinitylistApp() +``` + +### Open downloaded data + +The following code identifies the file path where your GBIF Global records are downloaded if you want to open data in R or export it for other uses. This is usually handy if you want to orientate the map to where your download is from using `"Choose a lat/long"`. + +In the next code chunk, replace `"Podarcis"` in the `pattern` argument with the name of the taxa you have downloaded data for in the previous step. This code will provide the full file paths of objects that match the `pattern` argument. + +If you specified `save_raw_data = TRUE` in `download_gbif_obs()`, this code will give you two file paths. The file with the prefix: + +- `"GBIF-preprocessed-"` is the raw download **before** our data cleaning. +- `"Living-Atlas-"` is the final **cleaned** download of the data you view in the app. + + +``` r +# Locate file path of downloads +system.file(package = "infinitylists") |> + file.path("data") |> + list.files(pattern = "Podarcis", full.names = TRUE) # Match for Podarcis +``` + +Copy the file path and paste it into the `read_parquet()` function to open the download in R. 
+ + +``` r +gbif_podarcis <- arrow::read_parquet("infinitylists/inst/data/Living-Atlas-Podarcis-2024-09-13.parquet") +``` + + + + +``` r +gbif_podarcis |> print(n = 10) +#> # A tibble: 6,155 × 11 +#> Species Genus Family `Collection Date` Lat Long `Voucher Type` Repository +#> * +#> 1 Podarcis tiliguer… Poda… Lacer… 2020-10-18 09:20:00 41.6 8.81 Photograph https://w… +#> 2 Podarcis tiliguer… Poda… Lacer… 2024-07-01 14:41:00 42.3 8.87 Photograph https://w… +#> 3 Podarcis tiliguer… Poda… Lacer… 2022-05-30 00:00:00 41.8 9.23 Photograph https://w… +#> 4 Podarcis siculus Poda… Lacer… 2024-05-17 10:37:00 42.8 9.48 Photograph https://w… +#> 5 Podarcis muralis Poda… Lacer… 2023-07-28 17:05:00 43.5 -1.52 Photograph https://w… +#> 6 Podarcis liolepis Poda… Lacer… 2020-04-08 00:00:00 44.0 3.52 Photograph https://w… +#> 7 Podarcis liolepis Poda… Lacer… 2014-05-09 08:44:00 43.6 3.02 Photograph https://w… +#> 8 Podarcis muralis Poda… Lacer… 2023-05-14 19:55:00 47.6 1.34 Photograph https://w… +#> 9 Podarcis muralis Poda… Lacer… 2023-10-14 12:39:00 44.6 6.17 Photograph https://w… +#> 10 Podarcis muralis Poda… Lacer… 2020-06-18 04:07:00 43.9 -1.38 Photograph https://w… +#> # ℹ 6,145 more rows +#> # ℹ 3 more variables: `Recorded by` , `Establishment Means` , Link +``` + + diff --git a/vignettes/diy.Rmd.orig b/vignettes/diy.Rmd.orig new file mode 100644 index 0000000..e53eba9 --- /dev/null +++ b/vignettes/diy.Rmd.orig @@ -0,0 +1,130 @@ +--- +title: "Create your own infinitylists" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{DIY infinitylists} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval = FALSE +) + +library(infinitylists) +library(galah) +library(arrow) +library(dplyr) + +# knitr::knit("vignettes/diy.Rmd.orig", output = "vignettes/diy.Rmd") +``` + +One major benefit of infinitylists using a Living Atlas node is that is gives users the 
ability to create their own version of infinitylists for whichever Living Atlas you would like to use. Unfortunately, there are some slight inconsistencies in data coverage and naming between Living Atlas data providers. This makes creating your own infinitylists not an entirely straightforward process but I hope this article will be able to give you some guidance. + +Here, I will walk through the process on how to adapt the source code of infinitylists so you can create your own version of infinitylists for any country and taxa of your choice. + +If you have any questions about this process, please do not hesitate to reach out by submitting an issue at the [infinitylists repository](https://github.com/traitecoevo/infinitylists). + +### Load dependencies + +We are going to need a few packages to create your own infinitylists. Go ahead and install these if you don't have these in your version of R. Otherwise, load them and we can get started. + +```{r setup} +# install.packages("devtools") +devtools::install_github("traitecoevo/infinitylists") +library(infinitylists) +library(galah) +library(arrow) +library(tidyverse) +``` + +You will need to register for a [GBIF account](https://www.gbif.org/). Click on the "Login" button on the top left corner and click on the "Register" tab. Note down your login credentials for safe-keeping once you have verified your account and created a password. + +### Configure galah + +We will be using [{galah}](https://galah.ala.org.au/R/) to download occurrence records used in our infinitylist. To do so, we need to configure the settings so the package knows to point to the Global GBIF API. + +Here I've saved the credentials in my R environment so they are not shared publicly. I can call on these environment variables using `Sys.getenv()`. You can set them with `usethis::edit_r_environ()`. 
+ +```{r} +# Set atlas +galah_config( + atlas = "Global", + username = Sys.getenv("GBIF_USERNAME"), + password = Sys.getenv("GBIF_PWD"), + email = Sys.getenv("GBIF_EMAIL") +) + +``` + +### Submit data request + +Once we have all that set up, we can request data from GBIF Global. Here I am downloading records for the wall lizard genus 'Podarcis', from years 2000 to 2024. Under `country_code`, I've specified `"FR"` for records found in France. Here is a [list of codes](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) for each country. The `download_gbif_obs` function will download the records and save them inside the infinitylist R package so you can use them immediately. + +Note that depending on how many records are requested, the download will take some time. + +```{r} +download_gbif_obs("Podarcis", + min_year = 2000, + max_year = 2024, + country_code = "FR") +``` + +### Pre-download check + +You can check roughly how big your download is by using the `query_gbif_global()` function with `galah::atlas_counts()`. Note this will not be the final number of records that go into infinitylist as we do further exclusions and data cleaning behind the scenes. + +```{r} +query_gbif_global("Podarcis", + min_year = 2000, + max_year = 2024, + country_code = "FR") |> + galah::atlas_counts() +``` + +You can investigate the full download by specifying `save_raw_data = TRUE` in `download_gbif_obs()`. + +### Launch infinitylist and explore! + +Once the download is complete, you are all set! Launch infinitylist and you will find your download under the dropdown menu "taxa". + +```{r} +infinitylistApp() +``` + +### Open downloaded data + +The following code identifies the file path where your GBIF Global records are downloaded if you want to open data in R or export it for other uses. This is usually handy if you want to orientate the map to where your download is from using `"Choose a lat/long"`. 
+ +In the next code chunk, replace `"Podarcis"` in the `pattern` argument with the name of the taxa you have downloaded data for in the previous step. This code will provide the full file paths of objects that match the `pattern` argument. + +If you specified `save_raw_data = TRUE` in `download_gbif_obs()`, this code will give you two file paths. The file with the prefix: + +- `"GBIF-preprocessed-"` is the raw download **before** our data cleaning. +- `"Living-Atlas-"` is the final **cleaned** download of the data you view in the app. + +```{r} +# Locate file path of downloads +system.file(package = "infinitylists") |> + file.path("data") |> + list.files(pattern = "Podarcis", full.names = TRUE) # Match for Podarcis +``` + +Copy the file path and paste it into the `read_parquet()` function to open the download in R. + +```{r} +gbif_podarcis <- arrow::read_parquet("infinitylists/inst/data/Living-Atlas-Podarcis-2024-09-13.parquet") +``` + +```{r include=FALSE, eval=TRUE} +gbif_podarcis <- arrow::read_parquet(here::here("inst/data/Living-Atlas-Podarcis-2024-09-13.parquet")) +``` + +```{r, eval=TRUE} +gbif_podarcis |> print(n = 10) +``` + +