From 2582f355eceac27d6e020c1ed7535ad17d7b07f2 Mon Sep 17 00:00:00 2001 From: rqthomas Date: Fri, 1 Dec 2023 13:43:36 -0500 Subject: [PATCH] updating documentation and validator script --- R/forecast_output_validator.R | 252 +++++-------------------------- man/check_submission.Rd | 22 --- man/forecast_output_validator.Rd | 26 +--- man/noaa_stage1.Rd | 3 +- man/noaa_stage2.Rd | 3 +- man/noaa_stage3.Rd | 7 +- man/submit.Rd | 2 +- 7 files changed, 49 insertions(+), 266 deletions(-) delete mode 100644 man/check_submission.Rd diff --git a/R/forecast_output_validator.R b/R/forecast_output_validator.R index 81ff118..76fa67b 100644 --- a/R/forecast_output_validator.R +++ b/R/forecast_output_validator.R @@ -1,74 +1,31 @@ -#' forecast_output_validator +#' Validate forecast file #' -#' @param forecast_file Your forecast csv or nc file -#' @param target_variables Possible target variables -#' @param theme_names valid EFI theme names +#' @param forecast_file forecast csv or csv.gz file #' @export -#' -#' @examples -#' -#' forecast_file <- system.file("extdata/aquatics-2021-02-01-EFInull.csv.gz", -#' package = "neon4cast") -#' forecast_output_validator(forecast_file) -#' -forecast_output_validator <- function(forecast_file, - target_variables = c("oxygen", - "temperature", - "richness", - "abundance", - "nee", - "le", - "vswc", - "gcc_90", - "rcc_90", - "ixodes_scapularis", - "amblyomma_americanum", - "prediction", - "observed"), - #GENERALIZATION: Specific themes - theme_names = c("aquatics", "beetles", - "phenology", "terrestrial_30min", - "terrestrial_daily","ticks")){ + +forecast_output_validator <- function(forecast_file){ + + file_in <- forecast_file valid <- TRUE message(file_in) - #usethis::ui_todo("Checking validity of file name...") - file_basename <- basename(file_in) - parsed_basename <- unlist(stringr::str_split(file_basename, "-")) - file_name_parsable <- TRUE - - if(!(parsed_basename[1] %in% theme_names)){ - usethis::ui_warn(paste0("first position of file name (before first -) is not one of the following : ", - paste(theme_names, collapse = " "))) - valid <- FALSE - file_name_parsable <- FALSE - } - - date_string <- lubridate::as_date(paste(parsed_basename[2:4], collapse = "-")) - - if(is.na(date_string)){ - usethis::ui_warn("file name does not contain parsable date") - file_name_parsable <- FALSE - valid <- FALSE - } - - if(file_name_parsable){ - usethis::ui_done("file name is correct") - } - - if(any(vapply(c("[.]csv", "[.]csv\\.gz"), grepl, logical(1), file_in))){ + if(any(vapply(c("[.]csv", "[.]csv\\.gz"), grepl, logical(1), file_in))){ # if file is csv zip file out <- readr::read_csv(file_in, guess_max = 1e6, show_col_types = FALSE) + if(lexists(out, c("model_id"))){ + usethis::ui_done("file has model_id column") + }else{ + usethis::ui_warn("file missing model_id column ") + } + + if("variable" %in% names(out) & "prediction" %in% names(out)){ usethis::ui_done("forecasted variables found correct variable + prediction column") - }else if("variable" %in% names(out) & "predicted" %in% names(out)){ - usethis::ui_warn("file as predicted column. change column name to prediction") - valid <- FALSE }else{ usethis::ui_warn("missing the variable and prediction columns") valid <- FALSE @@ -81,199 +38,73 @@ forecast_output_validator <- function(forecast_file, valid <- FALSE }else if(lexists(out, "family")){ - if("normal" %in% unique(out$family)){ - usethis::ui_done("file has normal distribution in family column") - }else if("ensemble" %in% unique(out$family)){ - usethis::ui_done("file has ensemble distribution in family column") - }else{ - usethis::ui_warn("only normal or ensemble distributions in family columns are currently supported") - valid <- FALSE - } - if(lexists(out, "parameter")){ - if("mu" %in% unique(out$parameter) & "sigma" %in% unique(out$parameter)){ - usethis::ui_done("file has parameter and family column with normal distribution") - }else if("ensemble" %in% unique(out$family) | "sample" %in% unique(out$family) ){ - usethis::ui_done("file has parameter and family column with ensemble generated distribution") - }else{ - usethis::ui_warn("file does not have parameter column is not a normal or ensemble distribution") - valid <- FALSE - } + usethis::ui_done("file has correct family and parameter columns") }else{ - usethis::ui_warn("file does not have parameter and family column ") + usethis::ui_warn("file does not have parameter column ") valid <- FALSE } }else{ - usethis::ui_warn("file does not have ensemble or family + parameter column") + usethis::ui_warn("file does not have ensemble or family and/or parameter column") valid <- FALSE } #usethis::ui_todo("Checking that file contains siteID column...") if(lexists(out, c("site_id"))){ usethis::ui_done("file has site_id column") - }else if(lexists(out, c("siteID"))){ - usethis::ui_warn("file siteID column should be named site_id") }else{ usethis::ui_warn("file missing site_id column") } - #usethis::ui_todo("Checking that file contains parsable time column...") if(lexists(out, c("datetime"))){ - usethis::ui_done("file has time column") - if(!stringr::str_detect(out$datetime[1], "-")){ - usethis::ui_done("time column format is not in the correct YYYY-MM-DD format") + usethis::ui_done("file has datetime column") + if(!grepl("-", out$datetime[1])){ + usethis::ui_done("datetime column format is not in the correct YYYY-MM-DD format") valid <- FALSE }else{ if(sum(class(out$datetime) %in% c("Date","POSIXct")) > 0){ - usethis::ui_done("file has correct time column") + usethis::ui_done("file has correct datetime column") }else{ - usethis::ui_done("time column format is not in the correct YYYY-MM-DD format") + usethis::ui_done("datetime column format is not in the correct YYYY-MM-DD format") valid <- FALSE } } - }else if(lexists(out, c("time"))){ - usethis::ui_warn("time dimension should be named datetime. We are converting it during processing but please update your submission format") - valid <- TRUE }else{ - usethis::ui_warn("file missing time column") + usethis::ui_warn("file missing datetime column") valid <- FALSE } + + #if(lexists(out, c("duration"))){ + # usethis::ui_done("file has duration column") + #}else{ + # usethis::ui_warn("file missing duration column (values for the column: daily = P1D, hourly = PT1H)") + # valid <- FALSE + #} + + #if(lexists(out, c("project_id"))){ + # usethis::ui_done("file has project_id column") + #}else{ + # usethis::ui_warn("file missing project_id column (use `vera4cast` as the project_id") + # valid <- FALSE + #} + if(lexists(out, c("reference_datetime"))){ usethis::ui_done("file has reference_datetime column") }else if(lexists(out, c("start_time"))){ usethis::ui_warn("file start_time column should be named reference_datetime. We are converting it during processing but please update your submission format") }else{ usethis::ui_warn("file missing reference_datetime column") - } - - } else if(grepl("[.]nc", file_in)){ #if file is nc - - nc <- ncdf4::nc_open(file_in) - - #usethis::ui_todo("Checking that file contains correct variables...") - - if(lexists(nc$var, target_variables) > 0){ - usethis::ui_done("target variables found") - var_dim <- dim(ncdf4::ncvar_get(nc, varid = names(nc$var[which(names(nc$var) %in% target_variables)][1]))) - }else{ - usethis::ui_warn(paste0("no target variables in found in possible list: ", paste(target_variables, collapse = " "))) valid <- FALSE } - #usethis::ui_todo("Checking that time variable exist and is parseable...") - - if(lexists(nc$dim, c("time", "datetime"))){ - usethis::ui_done("file has time dimension") - if("time" %in% names(nc$dim)){ - usethis::ui_warn("time dimension should be named datetime we are converting it during processing but please update your submission format") - time <- ncdf4::ncvar_get(nc, "time") - time_dim <- length(time) - valid <- TRUE - }else{ - time <- ncdf4::ncvar_get(nc, "datetime") - tustr<-strsplit(ncdf4::ncatt_get(nc, varid = "datetime", "units")$value, " ") - t_string <- strsplit(ncdf4::ncatt_get(nc, varid = "datetime", "units")$value, " ")[[1]][1] - time_dim <- length(time) - time <-lubridate::as_date(time,origin=unlist(tustr)[3]) - if(t_string %in% c("days","seconds")){ - usethis::ui_done("file has correct time dimension") - }else{ - usethis::ui_warn("time dimension is in correct format") - valid <- FALSE - } - } - }else{ - usethis::ui_warn("file missing time dimension") - valid <- FALSE - } - - #usethis::ui_todo("Checking that siteID variable exists...") - #GENERALIZATION: using siteID here - should be site_id - if(lexists(nc$var, c("siteID", "site_id"))){ - usethis::ui_done("file has siteID variable") - }else{ - usethis::ui_warn("file missing siteID variable") - valid <- FALSE - } - - #usethis::ui_todo("Checking that netcdf contains site dimension...") - - if(lexists(nc$dim, c("site")) > 0){ - usethis::ui_done("file has site dimension") - site_dim <- length(ncdf4::ncvar_get(nc, "site")) - - }else{ - usethis::ui_warn("file missing site dimension") - valid <- FALSE - } - - #usethis::ui_todo("Checking that netcdf contains ensemble dimension...") - - if(lexists(nc$dim, "ensemble")){ - usethis::ui_warn("ensemble dimension should be named parameter") - ensemble_dim <- length(ncdf4::ncvar_get(nc, "ensemble")) - valid <- FALSE - }else if(lexists(nc$dim, "parameter")){ - usethis::ui_done("file has parameter dimension") - ensemble_dim <- length(ncdf4::ncvar_get(nc, "parameter")) - }else{ - usethis::ui_warn("file missing parameter dimension") - valid <- FALSE - } - - #usethis::ui_todo("Checking that netcdf dimensions are correct order...") - dim_order <- TRUE - - if(var_dim[1] != time_dim){ - usethis::ui_warn("time is not the first dimension") - valid <- FALSE - dim_order <- FALSE - } - - if(var_dim[2] != site_dim){ - usethis::ui_warn("site is not the second dimension") - valid <- FALSE - dim_order <- FALSE - } - - if(var_dim[3] != ensemble_dim){ - usethis::ui_warn("ensemble is not the third dimension") - valid <- FALSE - dim_order <- FALSE - } - - if(dim_order){ - usethis::ui_done("dimensions are correct order") - } - - ncdf4::nc_close(nc) - - }else if(grepl("[.]xml", file_in)){ #if file is eml - - #usethis::ui_todo("Checking validity of metdata...") - - #out <- EML::read_eml(file_in) - - #valid_metadata <- tryCatch(EFIstandards::forecast_validator(out),error = function(e){ - # message(e) - # return(FALSE) - #}, - #finally = NULL) - - #if(!valid_metadata){ - # usethis::ui_warn("metadata is not correct") - # valid <- FALSE - #}else{ - # usethis::ui_done("metadata is correct") - #} - valid <- TRUE }else{ + usethis::ui_warn("incorrect file extension (csv or csv.gz are accepted)") valid <- FALSE } if(!valid){ - message("Forecast file is not valid. The following link provides information about the format:\nhttps://projects.ecoforecast.org/neon4cast-docs/Submission-Instructions.html") + message("Forecast file is not valid. The following link provides information about the format:\nhttps://projects.ecoforecast.org/neon4cast-ci/instructions.html#forecast-file-format") }else{ message("Forecast format is valid") } @@ -283,5 +114,4 @@ forecast_output_validator <- function(forecast_file, lexists <- function(list,name){ any(!is.na(match(name, names(list)))) -} - +} \ No newline at end of file diff --git a/man/check_submission.Rd b/man/check_submission.Rd deleted file mode 100644 index 614be98..0000000 --- a/man/check_submission.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/submit.R -\name{check_submission} -\alias{check_submission} -\title{Check that submission was successfully processed} -\usage{ -check_submission( - forecast_file, - s3_region = "data", - s3_endpoint = "ecoforecast.org" -) -} -\arguments{ -\item{forecast_file}{Your forecast csv or nc file} - -\item{s3_region}{subdomain (leave as is for EFI challenge)} - -\item{s3_endpoint}{root domain (leave as is for EFI challenge)} -} -\description{ -Check that submission was successfully processed -} diff --git a/man/forecast_output_validator.Rd b/man/forecast_output_validator.Rd index 1e2c55f..e46fbae 100644 --- a/man/forecast_output_validator.Rd +++ b/man/forecast_output_validator.Rd @@ -2,31 +2,13 @@ % Please edit documentation in R/forecast_output_validator.R \name{forecast_output_validator} \alias{forecast_output_validator} -\title{forecast_output_validator} +\title{Validate forecast file} \usage{ -forecast_output_validator( - forecast_file, - target_variables = c("oxygen", "temperature", "richness", "abundance", "nee", "le", - "vswc", "gcc_90", "rcc_90", "ixodes_scapularis", "amblyomma_americanum", - "prediction", "observed"), - theme_names = c("aquatics", "beetles", "phenology", "terrestrial_30min", - "terrestrial_daily", "ticks") -) +forecast_output_validator(forecast_file) } \arguments{ -\item{forecast_file}{Your forecast csv or nc file} - -\item{target_variables}{Possible target variables} - -\item{theme_names}{valid EFI theme names} +\item{forecast_file}{forecast csv or csv.gz file} } \description{ -forecast_output_validator -} -\examples{ - -forecast_file <- system.file("extdata/aquatics-2021-02-01-EFInull.csv.gz", - package = "neon4cast") -forecast_output_validator(forecast_file) - +Validate forecast file } diff --git a/man/noaa_stage1.Rd b/man/noaa_stage1.Rd index bc1c7e5..42f3ddb 100644 --- a/man/noaa_stage1.Rd +++ b/man/noaa_stage1.Rd @@ -9,8 +9,7 @@ noaa_stage1( version = "v12", endpoint = "data.ecoforecast.org", verbose = TRUE, - start_date = "", - site_id = NA + start_date = "" ) } \arguments{ diff --git a/man/noaa_stage2.Rd b/man/noaa_stage2.Rd index 91ddba3..d503783 100644 --- a/man/noaa_stage2.Rd +++ b/man/noaa_stage2.Rd @@ -15,8 +15,7 @@ noaa_stage2( version = "v12", endpoint = NA, verbose = TRUE, - start_date = "", - site_id = NA + start_date = "" ) } \arguments{ diff --git a/man/noaa_stage3.Rd b/man/noaa_stage3.Rd index 8836024..41ad9bf 100644 --- a/man/noaa_stage3.Rd +++ b/man/noaa_stage3.Rd @@ -4,12 +4,7 @@ \alias{noaa_stage3} \title{NOAA GEFS forecasts with EFI stage 3 processing} \usage{ -noaa_stage3( - version = "v12", - endpoint = "data.ecoforecast.org", - verbose = TRUE, - site_id = NA -) +noaa_stage3(version = "v12", endpoint = "data.ecoforecast.org", verbose = TRUE) } \arguments{ \item{version}{GEFS forecast version. Prior versions correspond to forecasts diff --git a/man/submit.Rd b/man/submit.Rd index 297938f..608dfe0 100644 --- a/man/submit.Rd +++ b/man/submit.Rd @@ -13,7 +13,7 @@ submit( ) } \arguments{ -\item{forecast_file}{Your forecast csv or nc file} +\item{forecast_file}{forecast csv or csv.gz file} \item{metadata}{path to metadata file}