Skip to content

Commit

Permalink
Workflow of adapting infinitylists for Global use via GBIF Global node (
Browse files Browse the repository at this point in the history
#100)

* Updated file naming

* Removed Australia prefix in parquet names

* Started skeleton of vignette

* Added spiel about finding matching fields

* Comments about Establishment means, outlining spatial part of the workflow and discussion about filtering in process_data()

* Minor edits and seeing through establishment means

* Edits

* Code to translate for gbif_global

* Processing GBIF code

* Sorting out date mystery

* Added gbif process code

* Commented out downloads

* Moved gbif code into its own script

* Fixed mapping for large downloads

* Added skeleton documentation

* Fixed naming of Establishment Means and saving Link in parquet for GBIF

* Added wrapper to download gbif obs but still buggy with dates

* Added documentation and suppressed warnings

* Updated documentation

* Updated doc

* Updated Global vars, updated vignette workflow
and tests

* Updated README

* Removed name bits that were not working #85
  • Loading branch information
fontikar authored Oct 21, 2024
1 parent 90c425c commit 09d4e55
Show file tree
Hide file tree
Showing 17 changed files with 586 additions and 21 deletions.
6 changes: 5 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@
^\.github$
^codecov\.yml$
^inst/data$
ignore/
^ignore$





^doc$
^Meta$
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@
.DS_Store
ignore/
inst/data
inst/doc
/doc/
/Meta/
9 changes: 7 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,16 @@ Imports:
shiny,
shinybusy,
stringr,
shinythemes
shinythemes,
tidyr
Remotes:
traitecoevo/APCalign
Suggests:
job,
bsplus,
here,
job,
knitr,
rmarkdown,
testthat (>= 3.0.0)
Config/testthat/edition: 3
VignetteBuilder: knitr
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Generated by roxygen2: do not edit by hand

export(download_ala_obs)
export(download_gbif_obs)
export(infinitylistApp)
export(query_gbif_global)
import(data.table)
import(shiny)
9 changes: 4 additions & 5 deletions R/galah_download.R
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ retrieve_data_by_years <- function(taxon,
download,
paste0(
output_dir,
"ALA-Australia-",
"Living-Atlas",
taxon,
"-",
first(years),
Expand Down Expand Up @@ -189,12 +189,12 @@ get_establishment_status <- function(ala_cleaned, taxon = taxon) {
ala_cleaned$native_anywhere_in_aus <- "unknown"
}
# Rename native_anywhere_in_aus
ala_cleaned <- dplyr::rename(ala_cleaned,"Establishment means" = native_anywhere_in_aus)
ala_cleaned <- dplyr::rename(ala_cleaned,"Establishment Means" = native_anywhere_in_aus)

return(ala_cleaned)
}


#' Process downloaded data from Atlas of Living Australia
#' @noRd
process_data <- function(data) {
datasets_of_interest <- c(
Expand Down Expand Up @@ -244,7 +244,6 @@ process_data <- function(data) {
janitor::clean_names("title")
}


#' @noRd
save_data <- function(data, taxon, output_dir) {
if (!file.exists(file.path(output_dir))) {
Expand All @@ -255,7 +254,7 @@ save_data <- function(data, taxon, output_dir) {
arrow::write_parquet(x = data,
sink = file.path(
output_dir,
paste0("Australia-",
paste0("Living-Atlas-",
taxon,
"-",
Sys.Date(),
Expand Down
209 changes: 209 additions & 0 deletions R/gbif_download.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
#' Download and Process GBIF Observations
#'
#' Retrieves GBIF (Global Biodiversity Information Facility) occurrence
#' records for a taxon, filters and reshapes them, and writes the processed
#' result to disk as a parquet file.
#'
#' @param taxon Character. The taxon (species, genus, etc.) for which to retrieve GBIF data.
#' @param min_year Numeric. The minimum year for the observations to be retrieved. Default is 1923.
#' @param max_year Numeric. The maximum year for the observations to be retrieved. Default is the current year.
#' @param country_code Character. The ISO 3166-1 alpha-2 country code to filter observations by country. Default is NULL (no country filter).
#' @param save_raw_data Logical. Whether to save the raw data retrieved from GBIF. Default is FALSE.
#' @param output_dir Character. The directory where the processed data will be saved. Default is a "data" directory within the "infinitylists" package.
#'
#' @return None. The function saves the processed data to the specified output directory.
#' @export

download_gbif_obs <- function(taxon,
                              min_year = 1923,
                              max_year = as.numeric(format(Sys.Date(), "%Y")),
                              country_code = NULL,
                              save_raw_data = FALSE,
                              output_dir = file.path(system.file(package = "infinitylists"), "data/")) {
  # Step 1: pull the raw occurrence records from GBIF
  raw_obs <- retrieve_gbif_data(taxon, min_year, max_year, country_code,
                                save_raw_data, output_dir)

  # Step 2: filter and reshape the records for the app
  # NOTE(review): blanket suppressWarnings() hides every warning raised during
  # processing, not just expected date-parsing noise -- confirm which warnings
  # are intentional before narrowing this.
  cleaned_obs <- suppressWarnings(gbif_process_data(raw_obs))

  # Step 3: persist the processed records as parquet
  save_data(cleaned_obs, taxon, output_dir)
}


#' Generate galah query for Global GBIF Node
#'
#' Builds (but does not execute) a galah query for occurrence records of a
#' taxon that have no geospatial issues and fall within the requested year
#' range, optionally restricted to a single country.
#'
#' @param taxon character, genus/family/kingdom
#' @param min_year numeric, year cut off for query, only records where year >= min_year will be included
#' @param max_year numeric, year cut off for query, only records where year <= max_year will be included
#' @param country_code character, ISO 3166-1 alpha-2 code for country; NULL
#'   (default) applies no country filter
#' @return An unevaluated galah query object, ready to be passed to
#'   galah::atlas_counts() or galah::atlas_occurrences().
#' @export

query_gbif_global <- function(taxon,
                              min_year,
                              max_year,
                              country_code = NULL) {
  # Braced branches replace the original unbraced `if ... else ( ... )`
  # construct, which buried the assignment inside a parenthesized block.
  if (is.null(country_code)) {
    query <- galah::galah_call() |>
      galah::galah_identify(taxon) |>
      galah::galah_filter(
        hasGeospatialIssue == "false",
        year >= min_year,
        year <= max_year
      )
  } else {
    query <- galah::galah_call() |>
      galah::galah_identify(taxon) |>
      galah::galah_filter(
        country == country_code,
        hasGeospatialIssue == "false",
        year >= min_year,
        year <= max_year
      )
  }
  return(query)
}

#' Download GBIF data and save as output
#'
#' Executes the galah query for one year range and returns the records with
#' eventDate coerced to character (presumably so separately downloaded chunks
#' row-bind consistently -- see retrieve_gbif_data()).
#'
#' @param taxon character, taxon to query
#' @param min_year numeric, earliest year included
#' @param max_year numeric, latest year included
#' @param country_code character or NULL, optional ISO country filter
#' @param save_raw_data logical, also write the raw download to parquet?
#' @param output_dir character, directory for the raw parquet file.
#'   NOTE(review): the file name is built with paste0(), so this assumes
#'   output_dir ends with a path separator -- TODO confirm callers.
#' @noRd
#' @keywords internal
retrieve_gbif_data_by_year_range <- function(taxon, min_year, max_year, country_code = NULL,
                                             save_raw_data = FALSE, output_dir) {

  download <- query_gbif_global(taxon, min_year, max_year, country_code) |>
    galah::atlas_occurrences()

  # Optionally keep a copy of the raw download before any processing.
  # Braces added around the multi-line guarded call for safety.
  if (save_raw_data) {
    arrow::write_parquet(
      download,
      paste0(
        output_dir,
        "GBIF-preprocessed-",
        taxon,
        "-",
        min_year,
        "-",
        max_year,
        "-",
        Sys.Date(),
        ".parquet"
      )
    )
  }

  return(format_date_as_character(download))
}

#' Coerce the eventDate column to plain character.
#' Event dates arrive in mixed formats (years, year-month, timestamps with
#' zones -- see the filters in gbif_process_data), so they are carried as
#' character until explicitly parsed.
#' @noRd
#' @keywords internal
format_date_as_character <- function(data) {
  data[["eventDate"]] <- as.character(data[["eventDate"]])
  data
}


#' Retrieve GBIF records
#'
#' Counts the records matching the query first: small result sets are
#' downloaded in a single request, large ones are split into chunks of at
#' most ten years which are downloaded separately and row-bound.
#'
#' @param taxon character, taxon to query
#' @param min_year numeric, earliest year included
#' @param max_year numeric, latest year included
#' @param country_code character or NULL, optional ISO country filter
#' @param save_raw_data logical, save each raw download to parquet?
#' @param output_dir character, directory used for raw parquet downloads
#' @return A single table of downloaded occurrence records.
#' @noRd
#' @keywords internal

retrieve_gbif_data <- function(taxon, min_year, max_year,
                               country_code = NULL,
                               save_raw_data = FALSE,
                               output_dir = file.path(system.file(package = "infinitylists"), "data/")
                               ) {
  # Record count above which the download is split into year chunks
  max_single_download <- 1000000

  n_obs <- query_gbif_global(taxon, min_year, max_year, country_code) |>
    galah::atlas_counts() |>
    dplyr::pull(count)

  if (n_obs < max_single_download) {
    # Small enough to fetch in one request
    download <- retrieve_gbif_data_by_year_range(taxon,
                                                 min_year, max_year,
                                                 country_code,
                                                 save_raw_data,
                                                 output_dir)
  } else {
    # Split the year span into chunks of at most 10 years and fetch each one
    # separately. possibly() turns a failed chunk into NULL (its default
    # `otherwise`), which list_rbind() drops instead of aborting the run.
    years <- seq(min_year, max_year)
    year_chunks <- split(years, ceiling(seq_along(years) / 10))

    download <- purrr::map(year_chunks,
                           purrr::possibly(
                             ~ retrieve_gbif_data_by_year_range(taxon,
                                                                min_year = range(.x)[1],
                                                                max_year = range(.x)[2],
                                                                country_code,
                                                                save_raw_data,
                                                                output_dir))
    ) |>
      purrr::list_rbind()
  }
  return(download)
}

#' Process downloaded GBIF data
#'
#' Filters raw GBIF occurrence records down to vouchered observations
#' (preserved specimens or iNaturalist records) with usable coordinates and
#' parseable event dates, then derives the display columns used by the app
#' (collection date, repository, link, voucher type, lat/long).
#' @noRd
#' @keywords internal

gbif_process_data <- function(data){
  data |>
    # Drop records with no latitude (i.e. no usable coordinates)
    tidyr::drop_na(decimalLatitude) |>
    dplyr::filter(
      # Keep only vouchered records: preserved specimens, or records whose
      # institutionCode matches "inaturalist" (case-insensitive)
      basisOfRecord == "PRESERVED_SPECIMEN" |
        stringr::str_detect(institutionCode, stringr::regex("inaturalist", ignore_case = TRUE)),
      # Keep unknown or reasonably small (<= 1 km) coordinate uncertainty
      is.na(coordinateUncertaintyInMeters) |
        coordinateUncertaintyInMeters <= 1000,
      !is.na(eventDate),
      # Drop undetermined species names ending in "spec."
      # NOTE(review): the "." in "spec.$" is an unescaped regex wildcard, so
      # this also matches e.g. "specs" -- confirm whether "spec\\.$" was meant.
      # NOTE(review): str_count() with its default empty pattern is used below
      # as a character count; str_length() would state that intent directly --
      # confirm equivalence before changing.
      !stringr::str_detect(species, "spec.$"),
      !stringr::str_count(eventDate) <= 7, # Exclude strings with 7 or fewer characters, these are years or year + month e.g 2006-06 or just 2006
      !stringr::str_count(eventDate) > 16
    ) |> # Exclude strings with greater than 16 characters - a few records had date ranges e.g. 2017-12-10T00:00Z/2017-12-23T00:00Z
    dplyr::mutate(
      eventDate_as_date = lubridate::as_date(eventDate), # Convert to dates
      eventDate_ymd = lubridate::ymd_hm(eventDate, tz = "UTC", quiet = TRUE), # Convert dates that have time zones
      collectionDate = dplyr::coalesce(eventDate_as_date, eventDate_ymd) # Put the two date columns together as one complete one.
    ) |>
    dplyr::mutate(
      repository = dplyr::case_when(grepl("inatur", occurrenceID) ~ occurrenceID, # Create Repository column, if occurrence ID contains "inatur", keep occurrenceID
                                    TRUE ~ institutionCode), # Otherwise take institutionCode
      link = dplyr::case_when(grepl("https", repository) ~ repository, # Create link: reuse the repository URL when it is one,
                              TRUE ~ paste0("https://www.gbif.org/dataset/", datasetKey) # otherwise link to the GBIF dataset page
      ),
      sounds = dplyr::case_when( # Flag audio records so voucher_type can label them below
        grepl("Sound", mediaType) ~ 1,
        TRUE ~ 0
      ),
      voucher_type = dplyr::case_when(
        basisOfRecord == "PRESERVED_SPECIMEN" ~ "Collection",
        sounds == 1 ~ "Audio",
        TRUE ~ "Photograph"
      ),
      lat = decimalLatitude,
      long = decimalLongitude,
    ) |>
    dplyr::select(
      species, genus, family,
      collectionDate,
      lat,
      long,
      voucher_type,
      repository,
      recordedBy,
      establishmentMeans,
      link
    ) |>
    # Title-case column names for display (e.g. "Establishment Means")
    janitor::clean_names("title")
}


11 changes: 9 additions & 2 deletions R/infinitylists-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,14 @@ utils::globalVariables(
"write.csv",
"Link",
"Repository",
"Establishment means",
"repository"
"Establishment Means",
"repository",
"eventDate_as_date",
"eventDate_ymd",
"establishmentMeans",
"hasGeospatialIssue",
"link",
"country",
"count"
)
)
12 changes: 5 additions & 7 deletions R/server.R
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ infinity_server <- function(...) {
total_family <- length(unique(data$Family))

native <-
dplyr::filter(data, `Establishment means` == "native")
dplyr::filter(data, `Establishment Means` == "native")
if (nrow(native) > 0)
total_native_species <- length(unique(native$Species))
else
Expand Down Expand Up @@ -235,7 +235,7 @@ infinity_server <- function(...) {
total_species,
" species observed, ",
total_native_species,
" are considered native to Australia."
" are considered native"
)
})

Expand Down Expand Up @@ -336,7 +336,7 @@ infinity_server <- function(...) {
`Recorded by` = `Recorded by`[1]
),
by = .(Species,
`Establishment means`,
`Establishment Means`,
`Voucher type` = `Voucher Type`)]


Expand Down Expand Up @@ -392,10 +392,8 @@ infinity_server <- function(...) {
output$downloadData <- downloadHandler(
filename = function() {
name_bits <- gsub(".parquet", "", input$ala_path)
name_bits <- gsub("Australia-", "", name_bits)
paste(input$place,
"-",
input$buffer_size,
name_bits <- gsub("infinitylists-", "", name_bits)
paste(input$buffer_size,
"m-buffer-",
name_bits,
".csv",
Expand Down
3 changes: 1 addition & 2 deletions R/ui.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ ui <- function(){

files_in_directory <- list.files(path = system.file(package = "infinitylists", "data/"), pattern = ".parquet")

taxa_names <-
gsub("Australia-(.+?)-[0-9]{4}-[0-9]{2}-[0-9]{2}.parquet",
taxa_names <- gsub("-[0-9]{4}-[0-9]{2}-[0-9]{2}.parquet",
"\\1",
files_in_directory)

Expand Down
7 changes: 6 additions & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,15 @@ galah::galah_config(email = "YOUR EMAIL HERE")
download_ala_obs(taxon = "Orthoptera")
infinitylistApp()
```

## Adapt infinitylist for other countries

```
We have developed functions to assist users to create their own infinitylist for their chosen taxa and country. Check out the vignette which shows you how to do so!

```{r}
vignette("diy")
```

## Why did I get disconnected from the server?

Expand Down
Loading

0 comments on commit 09d4e55

Please sign in to comment.