From 2582f355eceac27d6e020c1ed7535ad17d7b07f2 Mon Sep 17 00:00:00 2001
From: rqthomas <rqthomas@vt.edu>
Date: Fri, 1 Dec 2023 13:43:36 -0500
Subject: [PATCH] Update documentation and simplify forecast validator to csv-only checks

---
 R/forecast_output_validator.R    | 252 +++++--------------------------
 man/check_submission.Rd          |  22 ---
 man/forecast_output_validator.Rd |  26 +---
 man/noaa_stage1.Rd               |   3 +-
 man/noaa_stage2.Rd               |   3 +-
 man/noaa_stage3.Rd               |   7 +-
 man/submit.Rd                    |   2 +-
 7 files changed, 49 insertions(+), 266 deletions(-)
 delete mode 100644 man/check_submission.Rd

diff --git a/R/forecast_output_validator.R b/R/forecast_output_validator.R
index 81ff118..76fa67b 100644
--- a/R/forecast_output_validator.R
+++ b/R/forecast_output_validator.R
@@ -1,74 +1,31 @@
-#' forecast_output_validator
+#' Validate forecast file
 #'
-#' @param forecast_file Your forecast csv or nc file
-#' @param target_variables  Possible target variables
-#' @param theme_names valid EFI theme names
+#' @param forecast_file forecast csv or csv.gz file
 #' @export
-#' 
-#' @examples 
-#' 
-#' forecast_file <- system.file("extdata/aquatics-2021-02-01-EFInull.csv.gz", 
-#'                               package = "neon4cast")
-#' forecast_output_validator(forecast_file)
-#' 
-forecast_output_validator <- function(forecast_file, 
-                                      target_variables = c("oxygen", 
-                                                           "temperature", 
-                                                           "richness",
-                                                           "abundance", 
-                                                           "nee",
-                                                           "le", 
-                                                           "vswc",
-                                                           "gcc_90",
-                                                           "rcc_90",
-                                                           "ixodes_scapularis", 
-                                                           "amblyomma_americanum",
-                                                           "prediction",
-                                                           "observed"),
-                                      #GENERALIZATION:  Specific themes
-                                      theme_names = c("aquatics", "beetles",
-                                                      "phenology", "terrestrial_30min",
-                                                      "terrestrial_daily","ticks")){
+
+forecast_output_validator <- function(forecast_file){
+  
+  
   file_in <- forecast_file
   
   valid <- TRUE
   
   message(file_in)
   
-  #usethis::ui_todo("Checking validity of file name...")
-  file_basename <- basename(file_in)
-  parsed_basename <- unlist(stringr::str_split(file_basename, "-"))
-  file_name_parsable <- TRUE
-  
-  if(!(parsed_basename[1] %in% theme_names)){
-    usethis::ui_warn(paste0("first position of file name (before first -) is not one of the following : ",
-                            paste(theme_names, collapse = " ")))
-    valid <- FALSE
-    file_name_parsable <- FALSE
-  }
-  
-  date_string <- lubridate::as_date(paste(parsed_basename[2:4], collapse = "-"))
-  
-  if(is.na(date_string)){
-    usethis::ui_warn("file name does not contain parsable date")
-    file_name_parsable <- FALSE
-    valid <- FALSE
-  }
-  
-  if(file_name_parsable){
-    usethis::ui_done("file name is correct")
-  }
-  
-  if(any(vapply(c("[.]csv", "[.]csv\\.gz"), grepl, logical(1), file_in))){ 
+  if(any(vapply(c("[.]csv", "[.]csv\\.gz"), grepl, logical(1), file_in))){
     
     # if file is csv zip file
     out <- readr::read_csv(file_in, guess_max = 1e6, show_col_types = FALSE)
     
+    if(lexists(out, c("model_id"))){
+      usethis::ui_done("file has model_id column")
+    }else{
+      usethis::ui_warn("file missing model_id column ")
+    }
+    
+    
     if("variable" %in% names(out) & "prediction" %in% names(out)){
       usethis::ui_done("forecasted variables found correct variable + prediction column")
-    }else if("variable" %in% names(out) & "predicted" %in% names(out)){
-      usethis::ui_warn("file as predicted column.  change column name to prediction")
-      valid <- FALSE
     }else{
       usethis::ui_warn("missing the variable and prediction columns")
       valid <- FALSE
@@ -81,199 +38,73 @@ forecast_output_validator <- function(forecast_file,
       valid <- FALSE
     }else if(lexists(out, "family")){
       
-      if("normal" %in% unique(out$family)){
-        usethis::ui_done("file has normal distribution in family column")
-      }else if("ensemble" %in% unique(out$family)){
-        usethis::ui_done("file has ensemble distribution in family column")
-      }else{
-        usethis::ui_warn("only normal or ensemble distributions in family columns are currently supported")
-        valid <- FALSE
-      }
-      
       if(lexists(out, "parameter")){
-        if("mu" %in% unique(out$parameter) & "sigma" %in% unique(out$parameter)){
-          usethis::ui_done("file has parameter and family column with normal distribution")
-        }else if("ensemble" %in% unique(out$family) | "sample" %in% unique(out$family) ){
-          usethis::ui_done("file has parameter and family column with ensemble generated distribution")
-        }else{
-          usethis::ui_warn("file does not have parameter column is not a normal or ensemble distribution")
-          valid <- FALSE
-        }
+        usethis::ui_done("file has correct family and parameter columns")
       }else{
-        usethis::ui_warn("file does not have parameter and family column ")
+        usethis::ui_warn("file does not have parameter column ")
         valid <- FALSE
       }
       
     }else{
-      usethis::ui_warn("file does not have ensemble or family + parameter column")
+      usethis::ui_warn("file does not have ensemble or family and/or parameter column")
       valid <- FALSE
     }
     
     #usethis::ui_todo("Checking that file contains siteID column...")
     if(lexists(out, c("site_id"))){
       usethis::ui_done("file has site_id column")
-    }else if(lexists(out, c("siteID"))){
-      usethis::ui_warn("file siteID column should be named site_id")
     }else{
       usethis::ui_warn("file missing site_id column")
     }
     
-    #usethis::ui_todo("Checking that file contains parsable time column...")
     if(lexists(out, c("datetime"))){
-      usethis::ui_done("file has time column")
-      if(!stringr::str_detect(out$datetime[1], "-")){
-        usethis::ui_done("time column format is not in the correct YYYY-MM-DD format")
+      usethis::ui_done("file has datetime column")
+      if(!grepl("-", out$datetime[1])){
+        usethis::ui_warn("datetime column format is not in the correct YYYY-MM-DD format")
         valid <- FALSE
       }else{
         if(sum(class(out$datetime) %in% c("Date","POSIXct")) > 0){
-          usethis::ui_done("file has correct time column")
+          usethis::ui_done("file has correct datetime column")
         }else{
-          usethis::ui_done("time column format is not in the correct YYYY-MM-DD format")
+          usethis::ui_warn("datetime column format is not in the correct YYYY-MM-DD format")
           valid <- FALSE
         }
       }
-    }else if(lexists(out, c("time"))){
-      usethis::ui_warn("time dimension should be named datetime. We are converting it during processing but please update your submission format")
-      valid <- TRUE
     }else{
-      usethis::ui_warn("file missing time column")
+      usethis::ui_warn("file missing datetime column")
       valid <- FALSE
     }
     
+    
+    #if(lexists(out, c("duration"))){
+    #  usethis::ui_done("file has duration column")
+    #}else{
+    #  usethis::ui_warn("file missing duration column (values for the column: daily = P1D, hourly = PT1H)")
+    #  valid <- FALSE
+    #}
+    
+    #if(lexists(out, c("project_id"))){
+    #  usethis::ui_done("file has project_id column")
+    #}else{
+    #  usethis::ui_warn("file missing project_id column (use `vera4cast` as the project_id)")
+    #  valid <- FALSE
+    #}
+    
     if(lexists(out, c("reference_datetime"))){
       usethis::ui_done("file has reference_datetime column")
     }else if(lexists(out, c("start_time"))){
       usethis::ui_warn("file start_time column should be named reference_datetime. We are converting it during processing but please update your submission format")
     }else{
       usethis::ui_warn("file missing reference_datetime column")
-    }
-    
-  } else if(grepl("[.]nc", file_in)){ #if file is nc
-    
-    nc <- ncdf4::nc_open(file_in)
-    
-    #usethis::ui_todo("Checking that file contains correct variables...")
-    
-    if(lexists(nc$var, target_variables) > 0){
-      usethis::ui_done("target variables found")
-      var_dim <- dim(ncdf4::ncvar_get(nc, varid = names(nc$var[which(names(nc$var) %in% target_variables)][1])))
-    }else{
-      usethis::ui_warn(paste0("no target variables in found in possible list: ", paste(target_variables, collapse = " ")))
       valid <- FALSE
     }
     
-    #usethis::ui_todo("Checking that time variable exist and is parseable...")
-    
-    if(lexists(nc$dim, c("time", "datetime"))){
-      usethis::ui_done("file has time dimension")
-      if("time" %in% names(nc$dim)){
-        usethis::ui_warn("time dimension should be named datetime we are converting it during processing but please update your submission format")
-        time <- ncdf4::ncvar_get(nc, "time")
-        time_dim <- length(time)
-        valid <- TRUE
-      }else{
-        time <- ncdf4::ncvar_get(nc, "datetime")
-        tustr<-strsplit(ncdf4::ncatt_get(nc, varid = "datetime", "units")$value, " ")
-        t_string <- strsplit(ncdf4::ncatt_get(nc, varid = "datetime", "units")$value, " ")[[1]][1]
-        time_dim <- length(time)
-        time <-lubridate::as_date(time,origin=unlist(tustr)[3])
-        if(t_string %in% c("days","seconds")){
-          usethis::ui_done("file has correct time dimension")
-        }else{
-          usethis::ui_warn("time dimension is in correct format")
-          valid <- FALSE
-        }
-      }
-    }else{
-      usethis::ui_warn("file missing time dimension")
-      valid <- FALSE
-    }
-    
-    #usethis::ui_todo("Checking that siteID variable exists...")
-    #GENERALIZATION: using siteID here - should be site_id
-    if(lexists(nc$var, c("siteID", "site_id"))){
-      usethis::ui_done("file has siteID variable")
-    }else{
-      usethis::ui_warn("file missing siteID variable")
-      valid <- FALSE
-    }
-    
-    #usethis::ui_todo("Checking that netcdf contains site dimension...")
-    
-    if(lexists(nc$dim, c("site")) > 0){
-      usethis::ui_done("file has site dimension")
-      site_dim <- length(ncdf4::ncvar_get(nc, "site"))
-      
-    }else{
-      usethis::ui_warn("file missing site dimension")
-      valid <- FALSE
-    }
-    
-    #usethis::ui_todo("Checking that netcdf contains ensemble dimension...")
-    
-    if(lexists(nc$dim, "ensemble")){
-      usethis::ui_warn("ensemble dimension should be named parameter")
-      ensemble_dim <- length(ncdf4::ncvar_get(nc, "ensemble"))
-      valid <- FALSE
-    }else if(lexists(nc$dim, "parameter")){
-      usethis::ui_done("file has parameter dimension")
-      ensemble_dim <- length(ncdf4::ncvar_get(nc, "parameter"))
-    }else{
-      usethis::ui_warn("file missing parameter dimension")
-      valid <- FALSE
-    }
-    
-    #usethis::ui_todo("Checking that netcdf dimensions are correct order...")
-    dim_order <- TRUE
-    
-    if(var_dim[1] != time_dim){
-      usethis::ui_warn("time is not the first dimension")
-      valid <- FALSE
-      dim_order <- FALSE
-    }
-    
-    if(var_dim[2] != site_dim){
-      usethis::ui_warn("site is not the second dimension") 
-      valid <- FALSE
-      dim_order <- FALSE
-    }
-    
-    if(var_dim[3] != ensemble_dim){
-      usethis::ui_warn("ensemble is not the third dimension")
-      valid <- FALSE
-      dim_order <- FALSE
-    }
-    
-    if(dim_order){
-      usethis::ui_done("dimensions are correct order")
-    }
-    
-    ncdf4::nc_close(nc)
-    
-  }else if(grepl("[.]xml", file_in)){ #if file is eml
-    
-    #usethis::ui_todo("Checking validity of metdata...")
-    
-    #out <- EML::read_eml(file_in)
-    
-    #valid_metadata <- tryCatch(EFIstandards::forecast_validator(out),error = function(e){
-    #  message(e)
-    #  return(FALSE)
-    #}, 
-    #finally = NULL)
-    
-    #if(!valid_metadata){
-    #  usethis::ui_warn("metadata is not correct")
-    #  valid <- FALSE
-    #}else{
-    #  usethis::ui_done("metadata is correct")
-    #}
-    valid <- TRUE
   }else{
+    usethis::ui_warn("incorrect file extension (csv or csv.gz are accepted)")
     valid <- FALSE
   }
   if(!valid){
-    message("Forecast file is not valid. The following link provides information about the format:\nhttps://projects.ecoforecast.org/neon4cast-docs/Submission-Instructions.html")
+    message("Forecast file is not valid. The following link provides information about the format:\nhttps://projects.ecoforecast.org/neon4cast-ci/instructions.html#forecast-file-format")
   }else{
     message("Forecast format is valid")
   }
@@ -283,5 +114,4 @@ forecast_output_validator <- function(forecast_file,
 
 lexists <- function(list,name){
   any(!is.na(match(name, names(list))))
-}
-
+}
\ No newline at end of file
diff --git a/man/check_submission.Rd b/man/check_submission.Rd
deleted file mode 100644
index 614be98..0000000
--- a/man/check_submission.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/submit.R
-\name{check_submission}
-\alias{check_submission}
-\title{Check that submission was successfully processed}
-\usage{
-check_submission(
-  forecast_file,
-  s3_region = "data",
-  s3_endpoint = "ecoforecast.org"
-)
-}
-\arguments{
-\item{forecast_file}{Your forecast csv or nc file}
-
-\item{s3_region}{subdomain (leave as is for EFI challenge)}
-
-\item{s3_endpoint}{root domain (leave as is for EFI challenge)}
-}
-\description{
-Check that submission was successfully processed
-}
diff --git a/man/forecast_output_validator.Rd b/man/forecast_output_validator.Rd
index 1e2c55f..e46fbae 100644
--- a/man/forecast_output_validator.Rd
+++ b/man/forecast_output_validator.Rd
@@ -2,31 +2,13 @@
 % Please edit documentation in R/forecast_output_validator.R
 \name{forecast_output_validator}
 \alias{forecast_output_validator}
-\title{forecast_output_validator}
+\title{Validate forecast file}
 \usage{
-forecast_output_validator(
-  forecast_file,
-  target_variables = c("oxygen", "temperature", "richness", "abundance", "nee", "le",
-    "vswc", "gcc_90", "rcc_90", "ixodes_scapularis", "amblyomma_americanum",
-    "prediction", "observed"),
-  theme_names = c("aquatics", "beetles", "phenology", "terrestrial_30min",
-    "terrestrial_daily", "ticks")
-)
+forecast_output_validator(forecast_file)
 }
 \arguments{
-\item{forecast_file}{Your forecast csv or nc file}
-
-\item{target_variables}{Possible target variables}
-
-\item{theme_names}{valid EFI theme names}
+\item{forecast_file}{forecast csv or csv.gz file}
 }
 \description{
-forecast_output_validator
-}
-\examples{
-
-forecast_file <- system.file("extdata/aquatics-2021-02-01-EFInull.csv.gz", 
-                              package = "neon4cast")
-forecast_output_validator(forecast_file)
-
+Validate forecast file
 }
diff --git a/man/noaa_stage1.Rd b/man/noaa_stage1.Rd
index bc1c7e5..42f3ddb 100644
--- a/man/noaa_stage1.Rd
+++ b/man/noaa_stage1.Rd
@@ -9,8 +9,7 @@ noaa_stage1(
   version = "v12",
   endpoint = "data.ecoforecast.org",
   verbose = TRUE,
-  start_date = "",
-  site_id = NA
+  start_date = ""
 )
 }
 \arguments{
diff --git a/man/noaa_stage2.Rd b/man/noaa_stage2.Rd
index 91ddba3..d503783 100644
--- a/man/noaa_stage2.Rd
+++ b/man/noaa_stage2.Rd
@@ -15,8 +15,7 @@ noaa_stage2(
   version = "v12",
   endpoint = NA,
   verbose = TRUE,
-  start_date = "",
-  site_id = NA
+  start_date = ""
 )
 }
 \arguments{
diff --git a/man/noaa_stage3.Rd b/man/noaa_stage3.Rd
index 8836024..41ad9bf 100644
--- a/man/noaa_stage3.Rd
+++ b/man/noaa_stage3.Rd
@@ -4,12 +4,7 @@
 \alias{noaa_stage3}
 \title{NOAA GEFS forecasts with EFI stage 3 processing}
 \usage{
-noaa_stage3(
-  version = "v12",
-  endpoint = "data.ecoforecast.org",
-  verbose = TRUE,
-  site_id = NA
-)
+noaa_stage3(version = "v12", endpoint = "data.ecoforecast.org", verbose = TRUE)
 }
 \arguments{
 \item{version}{GEFS forecast version. Prior versions correspond to forecasts
diff --git a/man/submit.Rd b/man/submit.Rd
index 297938f..608dfe0 100644
--- a/man/submit.Rd
+++ b/man/submit.Rd
@@ -13,7 +13,7 @@ submit(
 )
 }
 \arguments{
-\item{forecast_file}{Your forecast csv or nc file}
+\item{forecast_file}{forecast csv or csv.gz file}
 
 \item{metadata}{path to metadata file}