Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Workflow of adapting infinitylists for Global use via GBIF Global node #100

Merged
merged 23 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@
^\.github$
^codecov\.yml$
^inst/data$
ignore/
^ignore$





^doc$
^Meta$
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@
.DS_Store
ignore/
inst/data
inst/doc
/doc/
/Meta/
9 changes: 7 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,16 @@ Imports:
shiny,
shinybusy,
stringr,
shinythemes
shinythemes,
tidyr
Remotes:
traitecoevo/APCalign
Suggests:
job,
bsplus,
here,
job,
knitr,
rmarkdown,
testthat (>= 3.0.0)
Config/testthat/edition: 3
VignetteBuilder: knitr
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Generated by roxygen2: do not edit by hand

export(download_ala_obs)
export(download_gbif_obs)
export(infinitylistApp)
export(query_gbif_global)
import(data.table)
import(shiny)
9 changes: 4 additions & 5 deletions R/galah_download.R
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ retrieve_data_by_years <- function(taxon,
download,
paste0(
output_dir,
"ALA-Australia-",
"Living-Atlas",
taxon,
"-",
first(years),
Expand Down Expand Up @@ -189,12 +189,12 @@ get_establishment_status <- function(ala_cleaned, taxon = taxon) {
ala_cleaned$native_anywhere_in_aus <- "unknown"
}
# Rename native_anywhere_in_aus
ala_cleaned <- dplyr::rename(ala_cleaned,"Establishment means" = native_anywhere_in_aus)
ala_cleaned <- dplyr::rename(ala_cleaned,"Establishment Means" = native_anywhere_in_aus)

return(ala_cleaned)
}


#' Process downloaded data from Atlas of Living Australia
#' @noRd
process_data <- function(data) {
datasets_of_interest <- c(
Expand Down Expand Up @@ -244,7 +244,6 @@ process_data <- function(data) {
janitor::clean_names("title")
}


#' @noRd
save_data <- function(data, taxon, output_dir) {
if (!file.exists(file.path(output_dir))) {
Expand All @@ -255,7 +254,7 @@ save_data <- function(data, taxon, output_dir) {
arrow::write_parquet(x = data,
sink = file.path(
output_dir,
paste0("Australia-",
paste0("Living-Atlas-",
taxon,
"-",
Sys.Date(),
Expand Down
209 changes: 209 additions & 0 deletions R/gbif_download.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
#' Download and Process GBIF Observations
#'
#' This function retrieves, processes, and saves GBIF (Global Biodiversity
#' Information Facility) observation data for a specified taxon. It is the
#' top-level entry point: retrieval, cleaning and saving are delegated to
#' internal helpers.
#'
#' @param taxon Character. The taxon (species, genus, etc.) for which to retrieve GBIF data.
#' @param min_year Numeric. The minimum year for the observations to be retrieved. Default is 1923.
#' @param max_year Numeric. The maximum year for the observations to be retrieved. Default is the current year.
#' @param country_code Character. The ISO 3166-1 alpha-2 country code to filter observations by country. Default is NULL (no country filter).
#' @param save_raw_data Logical. Whether to save the raw data retrieved from GBIF. Default is FALSE.
#' @param output_dir Character. The directory where the processed data will be saved. Default is a "data" directory within the "infinitylists" package.
#'
#' @return None. The function saves the processed data to the specified output directory.
#' @export

download_gbif_obs <- function(taxon,
                              min_year = 1923,
                              max_year = as.numeric(format(Sys.Date(), "%Y")),
                              country_code = NULL,
                              save_raw_data = FALSE,
                              output_dir = file.path(system.file(package = "infinitylists"), "data/")) {
  # 1. Data retrieval (chunked by year range when the record count is large)
  gbif_obs <-
    retrieve_gbif_data(taxon, min_year, max_year, country_code, save_raw_data, output_dir)

  # 2. Filtering and processing.
  # suppressWarnings(): date parsing inside gbif_process_data() emits
  # coercion warnings for date formats that are deliberately discarded.
  gbif_cleaned <- suppressWarnings(gbif_process_data(gbif_obs))

  # 3. Save processed data to output_dir as a parquet file
  save_data(gbif_cleaned, taxon, output_dir)
}


#' Generate galah query for Global GBIF Node
#'
#' Builds (but does not execute) a galah query against the global GBIF node,
#' restricted to records without geospatial issues within a year range, and
#' optionally to a single country.
#'
#' @param taxon character, genus/family/kingdom
#' @param min_year numeric, year cut off for query, only records where year >= min_year will be included
#' @param max_year numeric, year cut off for query, only records where year <= max_year will be included
#' @param country_code character, ISO 3166-1 alpha-2 code for country;
#'   when NULL (the default) no country filter is applied
#' @return An unevaluated galah query object; pass it to
#'   \code{galah::atlas_counts()} or \code{galah::atlas_occurrences()}.
#' @export

query_gbif_global <- function(taxon,
                              min_year,
                              max_year,
                              country_code = NULL) {
  if (is.null(country_code)) {
    # Global query: no country restriction
    query <- galah::galah_call() |>
      galah::galah_identify(taxon) |>
      galah::galah_filter(
        hasGeospatialIssue == "false",
        year >= min_year,
        year <= max_year
      )
  } else {
    # Same query with an additional country filter
    query <- galah::galah_call() |>
      galah::galah_identify(taxon) |>
      galah::galah_filter(
        country == country_code,
        hasGeospatialIssue == "false",
        year >= min_year,
        year <= max_year
      )
  }
  return(query)
}

#' Download GBIF data for a single year range and optionally save the raw result
#'
#' Executes the query built by query_gbif_global() for one contiguous year
#' range. Used directly for small downloads, and per-chunk by
#' retrieve_gbif_data() when the record count is too large for one request.
#'
#' @param taxon character, taxon passed to query_gbif_global()
#' @param min_year numeric, first year of the range (inclusive)
#' @param max_year numeric, last year of the range (inclusive)
#' @param country_code character, optional ISO 3166-1 alpha-2 country filter
#' @param save_raw_data logical, if TRUE the unprocessed download is written
#'   to output_dir as a date-stamped parquet file
#' @param output_dir character, directory (with trailing separator) for the
#'   optional raw-data parquet file
#' @return The downloaded records with eventDate coerced to character
#'   (see format_date_as_character()).
#' @noRd
#' @keywords internal
retrieve_gbif_data_by_year_range <- function(taxon, min_year, max_year, country_code = NULL,
                                             save_raw_data = FALSE, output_dir) {

  download <- query_gbif_global(taxon, min_year, max_year, country_code) |>
    galah::atlas_occurrences()

  # Save raw download (optional); file name records taxon, year range and
  # download date so repeated runs do not overwrite each other.
  if (save_raw_data) {
    arrow::write_parquet(
      download,
      paste0(
        output_dir,
        "GBIF-preprocessed-",
        taxon,
        "-",
        min_year,
        "-",
        max_year,
        "-",
        Sys.Date(),
        ".parquet"
      )
    )
  }

  # eventDate is stored as character so chunks with mixed date/date-time
  # types can be row-bound safely downstream.
  return(format_date_as_character(download))
}

#' Coerce the eventDate column to character.
#'
#' Different download chunks can return eventDate with differing types
#' (date vs date-time); storing it as character keeps row-binding safe.
#' @noRd
#' @keywords internal
format_date_as_character <- function(data){
  data[["eventDate"]] <- as.character(data[["eventDate"]])
  data
}


#' Retrieve GBIF records
#'
#' Counts the records matching the query first; small result sets are fetched
#' in a single request, large ones are split into ten-year chunks that are
#' downloaded independently and row-bound.
#'
#' @param taxon character, taxon passed to query_gbif_global()
#' @param min_year numeric, first year of the query range (inclusive)
#' @param max_year numeric, last year of the query range (inclusive)
#' @param country_code character, optional ISO 3166-1 alpha-2 country filter
#' @param save_raw_data logical, if TRUE each raw download is saved to output_dir
#' @param output_dir character, directory for optional raw-data parquet files
#' @noRd
#' @keywords internal

retrieve_gbif_data <- function(taxon, min_year, max_year,
                               country_code = NULL,
                               save_raw_data = FALSE,
                               output_dir = file.path(system.file(package = "infinitylists"), "data/")
                               ) {

  # Count matching records up front so we can decide whether the
  # download must be chunked.
  n_obs <- query_gbif_global(taxon, min_year, max_year, country_code) |>
    galah::atlas_counts() |>
    dplyr::pull(count)

  # Fewer than 1 million records: fetch everything in one request
  if (n_obs < 1000000) {
    download <- retrieve_gbif_data_by_year_range(taxon,
                                                 min_year, max_year,
                                                 country_code,
                                                 save_raw_data,
                                                 output_dir)
  } else {

    # Split the year span into chunks of at most 10 years and download
    # each chunk separately to stay under request-size limits.
    years <- seq(min_year, max_year)
    year_chunks <- split(years, ceiling(seq_along(years) / 10))

    # purrr::possibly(): a failed chunk yields NULL (silently dropped by
    # list_rbind()) instead of aborting the whole multi-chunk download.
    download <- purrr::map(year_chunks,
                           purrr::possibly(
                             ~ retrieve_gbif_data_by_year_range(taxon,
                                                                min_year = range(.x)[1],
                                                                max_year = range(.x)[2],
                                                                country_code,
                                                                save_raw_data,
                                                                output_dir))
                           ) |>
      purrr::list_rbind()

  }
  return(download)
}

#' Process downloaded occurrence data from GBIF
#'
#' Filters raw GBIF occurrences down to vouchered records (preserved
#' specimens or iNaturalist observations) with usable coordinates and
#' parseable dates, then standardises the columns expected by the app
#' (collection date, lat/long, voucher type, repository, link).
#' @noRd
#' @keywords internal

gbif_process_data <- function(data){
  data |>
    # Drop records with no latitude (and hence no usable coordinates)
    tidyr::drop_na(decimalLatitude) |>
    dplyr::filter(
      # Keep preserved specimens, or any record from an iNaturalist institution
      basisOfRecord == "PRESERVED_SPECIMEN" |
        stringr::str_detect(institutionCode, stringr::regex("inaturalist", ignore_case = TRUE)),
      # Coordinate uncertainty must be unknown or at most 1 km
      is.na(coordinateUncertaintyInMeters) |
        coordinateUncertaintyInMeters <= 1000,
      !is.na(eventDate),
      # Drop undetermined species; NOTE(review): the "." in "spec.$" is an
      # unescaped regex wildcard, so this also matches names ending in
      # "spec" + any character — confirm whether "spec\\.$" was intended.
      !stringr::str_detect(species, "spec.$"),
      # NOTE(review): str_count() counts pattern matches, not characters
      # (str_length() is the character count) — confirm these thresholds
      # behave as intended.
      !stringr::str_count(eventDate) <= 7, # Exclude strings with 7 or fewer characters; these are bare years or year + month, e.g. 2006-06 or just 2006
      !stringr::str_count(eventDate) > 16
    ) |> # Exclude strings with greater than 16 characters - a few records had date ranges e.g. 2017-12-10T00:00Z/2017-12-23T00:00Z
    dplyr::mutate(
      eventDate_as_date = lubridate::as_date(eventDate), # Plain dates parse here; other formats become NA
      eventDate_ymd = lubridate::ymd_hm(eventDate, tz = "UTC", quiet = TRUE), # Date-times with a time component parse here
      collectionDate = dplyr::coalesce(eventDate_as_date, eventDate_ymd) # Merge the two parses into one complete date column
    ) |>
    dplyr::mutate(
      repository = dplyr::case_when(grepl("inatur", occurrenceID) ~ occurrenceID, # iNaturalist occurrenceIDs are themselves URLs, so keep them
                                    TRUE ~ institutionCode), # Otherwise take institutionCode
      link = dplyr::case_when(grepl("https", repository) ~ repository, # Use the repository URL directly when it is one
                              TRUE ~ paste0("https://www.gbif.org/dataset/", datasetKey) # Otherwise link to the GBIF dataset page
      ),
      sounds = dplyr::case_when( # Flag (1/0) whether the record's media includes audio
        grepl("Sound", mediaType) ~ 1,
        TRUE ~ 0
      ),
      voucher_type = dplyr::case_when( # Classify the voucher: specimen > audio > photograph
        basisOfRecord == "PRESERVED_SPECIMEN" ~ "Collection",
        sounds == 1 ~ "Audio",
        TRUE ~ "Photograph"
      ),
      lat = decimalLatitude,
      long = decimalLongitude,
    ) |>
    dplyr::select(
      species, genus, family,
      collectionDate,
      lat,
      long,
      voucher_type,
      repository,
      recordedBy,
      establishmentMeans,
      link
    ) |>
    # Convert column names to Title Case for display in the app
    janitor::clean_names("title")
}


11 changes: 9 additions & 2 deletions R/infinitylists-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,14 @@ utils::globalVariables(
"write.csv",
"Link",
"Repository",
"Establishment means",
"repository"
"Establishment Means",
"repository",
"eventDate_as_date",
"eventDate_ymd",
"establishmentMeans",
"hasGeospatialIssue",
"link",
"country",
"count"
)
)
12 changes: 5 additions & 7 deletions R/server.R
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ infinity_server <- function(...) {
total_family <- length(unique(data$Family))

native <-
dplyr::filter(data, `Establishment means` == "native")
dplyr::filter(data, `Establishment Means` == "native")
if (nrow(native) > 0)
total_native_species <- length(unique(native$Species))
else
Expand Down Expand Up @@ -235,7 +235,7 @@ infinity_server <- function(...) {
total_species,
" species observed, ",
total_native_species,
" are considered native to Australia."
" are considered native"
)
})

Expand Down Expand Up @@ -336,7 +336,7 @@ infinity_server <- function(...) {
`Recorded by` = `Recorded by`[1]
),
by = .(Species,
`Establishment means`,
`Establishment Means`,
`Voucher type` = `Voucher Type`)]


Expand Down Expand Up @@ -392,10 +392,8 @@ infinity_server <- function(...) {
output$downloadData <- downloadHandler(
filename = function() {
name_bits <- gsub(".parquet", "", input$ala_path)
name_bits <- gsub("Australia-", "", name_bits)
paste(input$place,
"-",
input$buffer_size,
name_bits <- gsub("infinitylists-", "", name_bits)
paste(input$buffer_size,
"m-buffer-",
name_bits,
".csv",
Expand Down
3 changes: 1 addition & 2 deletions R/ui.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ ui <- function(){

files_in_directory <- list.files(path = system.file(package = "infinitylists", "data/"), pattern = ".parquet")

taxa_names <-
gsub("Australia-(.+?)-[0-9]{4}-[0-9]{2}-[0-9]{2}.parquet",
taxa_names <- gsub("-[0-9]{4}-[0-9]{2}-[0-9]{2}.parquet",
"\\1",
files_in_directory)

Expand Down
7 changes: 6 additions & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,15 @@ galah::galah_config(email = "YOUR EMAIL HERE")
download_ala_obs(taxon = "Orthoptera")

infinitylistApp()
```

## Adapt infinitylist for other countries

We have developed functions to assist users to create their own infinitylist for their chosen taxa and country. Check out the vignette, which shows you how to do so!

```{r}
vignette("diy")
```

## Why did I get disconnected from the server?

Expand Down
Loading
Loading