From 06f4e9605d9e2bdc7767f28d4618aa01227896cf Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Mon, 12 Jul 2021 11:33:03 -0700 Subject: [PATCH] get_OSD: TXT and HTML formats and unified interface --- R/OSD-file-access.R | 76 -------------- R/get_OSD.R | 157 +++++++++++++++++++++++++++++ R/get_OSD_JSON.R | 75 -------------- man/get_OSD.Rd | 43 ++++++++ man/get_OSD_JSON.Rd | 33 ------ tests/testthat/test-get_OSD.R | 19 ++++ tests/testthat/test-get_OSD_JSON.R | 15 --- 7 files changed, 219 insertions(+), 199 deletions(-) delete mode 100644 R/OSD-file-access.R create mode 100644 R/get_OSD.R delete mode 100644 R/get_OSD_JSON.R create mode 100644 man/get_OSD.Rd delete mode 100644 man/get_OSD_JSON.Rd create mode 100644 tests/testthat/test-get_OSD.R delete mode 100644 tests/testthat/test-get_OSD_JSON.R diff --git a/R/OSD-file-access.R b/R/OSD-file-access.R deleted file mode 100644 index 8ed6ca3d..00000000 --- a/R/OSD-file-access.R +++ /dev/null @@ -1,76 +0,0 @@ -## Utility functions for getting / prepping live (HTML) or local (TXT, OSDRegistry) OSDs. -## Migrated / adapted from parseOSD repo -## 2021-06-21 -## D.E. Beaudette - - -# generate a link to the "live" HTML OSDs for a single soil series name -.seriesNameToURL <- function(s) { - base.url <- 'http://soilseriesdesc.sc.egov.usda.gov/OSD_Docs/' - s <- toupper(s) - # convert space to _ - s <- gsub(pattern = ' ', replacement = '_', s, fixed = TRUE) - # TODO: convert apostrophe - - # final URL - u <- paste0(base.url, substr(s, 1, 1), '/', s, '.html') - return(u) -} - -# generate a link to local (TXT) OSDs, typically provided by OSDRegistry -.seriesNameToFileName <- function(s) { - s <- toupper(s) - # convert space to _ - s <- gsub(pattern = ' ', replacement = '_', s) - # first-letter indexing - u <- sprintf('%s/%s.%s', substr(s, 1, 1), s, 'txt') - return(u) -} - -# remove blank lines from HTML text -.removeBlankLines <- function(chunk) { - # extract lines and remove blank / NA lines - chunk.lines <- readLines(textConnection(chunk)) - chunk.lines <- chunk.lines[which(chunk.lines != '')] - chunk.lines <- chunk.lines[which(!is.na(chunk.lines))] - return(chunk.lines) -} - -# get "live" OSD from HTML record, convert to lines of text (HTML stripped) -# s: a soil series name -.getLiveOSD <- function(s) { - - # sanity check - if( !requireNamespace('rvest', quietly=TRUE)) - stop('please install the `rvest` package', call.=FALSE) - - # make URL - u <- .seriesNameToURL(s) - # get HTML content and strip blank / NA lines - s.html.text <- rvest::html_text(rvest::read_html(u)) - s.html.text <- .removeBlankLines(s.html.text) - # strip double quotes by converting to " inches" - s.html.text <- gsub('"', ' inches', s.html.text) - # done - return(s.html.text) -} - - -# get local (TXT) OSD, typically from OSDRegistry working copy -.getLocalOSD <- function(s, path = '') { - # file name - fn <- .seriesNameToFileName(s) - # local path - fp <- file.path(path, fn) - # load text lines - s.text <- readLines(fp) - # remove blank lines - s.text <- s.text[which(s.text != '')] - s.text <- s.text[which(!is.na(s.text))] - # strip double quotes by converting to " inches" - s.text <- gsub('"', ' inches', s.text) - # done - return(s.text) -} - - diff --git a/R/get_OSD.R b/R/get_OSD.R new file mode 100644 index 00000000..bcb4c602 --- /dev/null +++ b/R/get_OSD.R @@ -0,0 +1,157 @@ +#' Fetch Official Series Description Data from JSON, HTML or TXT sources +#' +#' @param series A character vector of Official Series names e.g. `"Chewacla"` +#' @param result Select `"json"`, `"html"`, or `"txt"` output +#' +#' @param base_url Optional: alternate JSON/HTML/TXT repository path. Default: `NULL` uses `"https://github.com/ncss-tech/SoilKnowledgeBase"` for `result="json"` +#' +#' @param verbose Print errors and warning messages related to HTTP requests? Default: `FALSE` +#' +#' @details The default `base_url` for `result="json"` is to JSON files stored in a GitHub repository that is regularly updated from the official source of Series Descriptions. Using format: `https://raw.githubusercontent.com/ncss-tech/SoilKnowledgeBase/main/inst/extdata/OSD/{LETTER}/{SERIES}.json` for JSON. And `"https://soilseriesdesc.sc.egov.usda.gov/OSD_Docs/{LETTER}/{SERIES}.html` is for `result="html"` (official source). +#' +#' @return For JSON result: A `data.frame` with 1 row per series, and 1 column per "section" in the OSD as defined in National Soil Survey Handbook. For TXT or HTML result a list of character vectors containing OSD text with 1 element per series and one value per line. +#' @export +#' @aliases get_OSD_JSON +#' @examples +#' +#' \donttest{ +#' if(requireNamespace("curl") & +#' curl::has_internet()) { +#' +#' series <- c("Musick", "Hector", "Chewacla") +#' get_OSD(series) +#' } +#' } +get_OSD <- function(series, base_url = NULL, result = c("json","html","txt"), verbose = FALSE) { + result <- match.arg(tolower(result), c("json","html","txt")) + + a_url <- NULL + if (!is.null(base_url)) { + a_url <- base_url + } + + switch(result, + "json" = { .get_OSD_JSON(series, base_url = a_url, verbose = verbose) }, + "html" = { .get_OSD_HTML(series, base_url = a_url, verbose = verbose) }, + "txt" = { .get_OSD_TXT(series, verbose = verbose) }) +} + +.get_OSD_HTML <- function(series, base_url = NULL, verbose = FALSE) { + if(!requireNamespace('rvest', quietly=TRUE)) + stop('please install the `rvest` package', call.=FALSE) + + if (missing(base_url) || is.null(base_url)) + base_url <- 'https://soilseriesdesc.sc.egov.usda.gov/OSD_Docs/' + + # get HTML content and strip blank / NA lines + res <- sapply(.seriesNameToURL(series, base_url = base_url), function(x) { + htmlres <- try(rvest::html_text(rvest::read_html(x)), silent = !verbose) + + if (inherits(htmlres, 'try-error')) + return(NULL) + + .stripOSDContents(readLines(textConnection(htmlres))) + }) + names(res) <- toupper(series) + res +} + +.get_OSD_TXT <- function(series, base_url = "", verbose = FALSE) { + sapply(series, function(x) { + fp <- .seriesNameToURL(x, base_url = base_url, extension = "txt") + + if (!file.exists(fp)) + return(NULL) + + # remove empty lines and fix other markup + try(.stripOSDContents(readLines(fp)), silent = !verbose) + }) +} + +get_OSD_JSON <- function(series, base_url = NULL) { + # .Deprecated("get_OSD") + .get_OSD_JSON(series, base_url) +} + +.get_OSD_JSON <- function(series, base_url = NULL, verbose = FALSE) { + + # http://github.com/ncss-tech/SoilKnowledgeBase is default JSON repository path + if (missing(base_url) || is.null(base_url)) + base_url <- "https://raw.githubusercontent.com/ncss-tech/SoilKnowledgeBase/main/inst/extdata/OSD" + + if (!requireNamespace("jsonlite")) + stop("package `jsonlite` is required", call. = FALSE) + + # convert series name to upper case and remove NA + series <- toupper(na.omit(series)) + + # get first letter of each taxon (if any) + if (length(series) > 0 && all(nchar(series) > 1)) { + firstLetter <- substr(series, 0, 1) + } else stop("argument `series` should be character vector of existing official series names", call. = FALSE) + + # construct URL + path <- file.path(base_url, firstLetter, paste0(series, ".json")) + + # query, handle errors, return 'tidy' data.frame result + data.frame(data.table::rbindlist(lapply(seq_along(path), function(i) { + + p <- path[i] + jsp <- try(jsonlite::read_json(p), silent = TRUE) + + # warning will be generated for non-existent URL + if (inherits(jsp, 'try-error')) + return(NULL) + + jspn <- names(jsp)[!names(jsp) %in% c('SITE','HORIZONS')] + res <- try({ + data.table::as.data.table(lapply(jspn, function(m) { + x <- jsp[[m]] + res2 <- x[[length(x)]] + if (is.null(res2)) + res2 <- NA + res2 + })) + }, silent = FALSE) + colnames(res) <- jspn + + jsp$SITE[[1]][[1]]$id <- i + res$SITE <- list(data.frame(data.table::rbindlist(lapply(jsp$SITE[[1]], data.frame), fill = TRUE))) + res$HORIZONS <- list(data.frame(data.table::rbindlist(lapply(jsp$HORIZONS[[1]], data.frame), fill = TRUE))) + + # handles weird cases + if (inherits(res, 'try-error')) + return(NULL) + + return(res) + }))) + +} + +## Migrated / adapted from parseOSD repo +## 2021-06-21 +## D.E. Beaudette + +# generate a link to the OSD for a vector of series names +.seriesNameToURL <- function(s, base_url = 'http://soilseriesdesc.sc.egov.usda.gov/OSD_Docs/', + extension = 'html') { + paste0(base_url, .seriesNameToFileName(s, extension = extension)) +} + +# prepare a file name and capitalized-first-letter folder based on a series name +.seriesNameToFileName <- function(s, extension = 'txt') { + + # convert space to _ + s <- gsub(pattern = ' ', replacement = '_', toupper(s)) + + # TODO: convert apostrophe + + sprintf('%s/%s.%s', substr(s, 1, 1), s, extension) +} + + # remove empty lines and NA strip double quotes by converting to " inches" +.stripOSDContents <- function(x) { + x <- x[which(x != '')] + x <- x[which(!is.na(x))] + gsub('"', ' inches', x) +} diff --git a/R/get_OSD_JSON.R b/R/get_OSD_JSON.R deleted file mode 100644 index dcd8286d..00000000 --- a/R/get_OSD_JSON.R +++ /dev/null @@ -1,75 +0,0 @@ -#' Fetch Official Series Description Raw Data in JSON format -#' -#' @param series A character vector of Official Series names e.g. `"Chewacla"` -#' @param base_url Optional: alternate JSON repository path. Default: `NULL` uses \url{https://github.com/ncss-tech/SoilKnowledgeBase}. -#' -#' @details The default `base_url` is to "raw" JSON files stored in a GitHub repository that is regularly updated from the official source of Series Descriptions. Using format: `https://raw.githubusercontent.com/ncss-tech/SoilKnowledgeBase/main/inst/extdata/OSD/{LETTER}/{SERIES}.json` -#' -#' @return A `data.frame` with 1 row per series, and 1 column per "section" in the OSD as defined in National Soil Survey Handbook. Includes two list columns containing _data.frame_ `SITE` and `HORIZONS` parsed from each Typical Pedon narrative. -#' @export -#' -#' @examples -#' -#' \donttest{ -#' if(requireNamespace("curl") & -#' curl::has_internet()) { -#' -#' series <- c("Musick", "Hector", "Chewacla") -#' get_OSD_JSON(series) -#' } -#' } -get_OSD_JSON <- function(series, - base_url = NULL) { - - # http://github.com/ncss-tech/SoilKnowledgeBase is default JSON repository path - if (missing(base_url) || is.null(base_url)) - base_url <- "https://raw.githubusercontent.com/ncss-tech/SoilKnowledgeBase/main/inst/extdata/OSD" - - if (!requireNamespace("jsonlite")) - stop("package `jsonlite` is required", call. = FALSE) - - # convert series name to upper case and remove NA - series <- toupper(na.omit(series)) - - # get first letter of each taxon (if any) - if (length(series) > 0 && all(nchar(series) > 1)) { - firstLetter <- substr(series, 0, 1) - } else stop("argument `series` should be character vector of existing official series names", call. = FALSE) - - # construct URL - path <- file.path(base_url, firstLetter, paste0(series, ".json")) - - # query, handle errors, return 'tidy' data.frame result - data.frame(data.table::rbindlist(lapply(seq_along(path), function(i) { - - p <- path[i] - jsp <- try(jsonlite::read_json(p), silent = TRUE) - - # warning will be generated for non-existent URL - if (inherits(jsp, 'try-error')) - return(NULL) - - jspn <- names(jsp)[!names(jsp) %in% c('SITE','HORIZONS')] - res <- try({ - data.table::as.data.table(lapply(jspn, function(m) { - x <- jsp[[m]] - res2 <- x[[length(x)]] - if (is.null(res2)) - res2 <- NA - res2 - })) - }, silent = FALSE) - colnames(res) <- jspn - - jsp$SITE[[1]][[1]]$id <- i - res$SITE <- list(data.frame(data.table::rbindlist(lapply(jsp$SITE[[1]], data.frame), fill = TRUE))) - res$HORIZONS <- list(data.frame(data.table::rbindlist(lapply(jsp$HORIZONS[[1]], data.frame), fill = TRUE))) - - # handles weird cases - if (inherits(res, 'try-error')) - return(NULL) - - return(res) - }))) - -} diff --git a/man/get_OSD.Rd b/man/get_OSD.Rd new file mode 100644 index 00000000..7e3e2883 --- /dev/null +++ b/man/get_OSD.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_OSD.R +\name{get_OSD} +\alias{get_OSD} +\alias{get_OSD_JSON} +\title{Fetch Official Series Description Data from JSON, HTML or TXT sources} +\usage{ +get_OSD( + series, + base_url = NULL, + result = c("json", "html", "txt"), + verbose = FALSE +) +} +\arguments{ +\item{series}{A character vector of Official Series names e.g. \code{"Chewacla"}} + +\item{base_url}{Optional: alternate JSON/HTML/TXT repository path. Default: \code{NULL} uses \code{"https://github.com/ncss-tech/SoilKnowledgeBase"} for \code{result="json"}} + +\item{result}{Select \code{"json"}, \code{"html"}, or \code{"txt"} output} + +\item{verbose}{Print errors and warning messages related to HTTP requests? Default: \code{FALSE}} +} +\value{ +For JSON result: A \code{data.frame} with 1 row per series, and 1 column per "section" in the OSD as defined in National Soil Survey Handbook. For TXT or HTML result a list of character vectors containing OSD text with 1 element per series and one value per line. +} +\description{ +Fetch Official Series Description Data from JSON, HTML or TXT sources +} +\details{ +The default \code{base_url} for \code{result="json"} is to JSON files stored in a GitHub repository that is regularly updated from the official source of Series Descriptions. Using format: \verb{https://raw.githubusercontent.com/ncss-tech/SoilKnowledgeBase/main/inst/extdata/OSD/\{LETTER\}/\{SERIES\}.json} for JSON. And \verb{"https://soilseriesdesc.sc.egov.usda.gov/OSD_Docs/\{LETTER\}/\{SERIES\}.html} is for \code{result="html"} (official source). +} +\examples{ + +\donttest{ +if(requireNamespace("curl") & + curl::has_internet()) { + +series <- c("Musick", "Hector", "Chewacla") +get_OSD(series) +} +} +} diff --git a/man/get_OSD_JSON.Rd b/man/get_OSD_JSON.Rd deleted file mode 100644 index f380a33a..00000000 --- a/man/get_OSD_JSON.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_OSD_JSON.R -\name{get_OSD_JSON} -\alias{get_OSD_JSON} -\title{Fetch Official Series Description Raw Data in JSON format} -\usage{ -get_OSD_JSON(series, base_url = NULL) -} -\arguments{ -\item{series}{A character vector of Official Series names e.g. \code{"Chewacla"}} - -\item{base_url}{Optional: alternate JSON repository path. Default: \code{NULL} uses \url{https://github.com/ncss-tech/SoilKnowledgeBase}.} -} -\value{ -A \code{data.frame} with 1 row per series, and 1 column per "section" in the OSD as defined in National Soil Survey Handbook. Includes two list columns containing \emph{data.frame} \code{SITE} and \code{HORIZONS} parsed from each Typical Pedon narrative. -} -\description{ -Fetch Official Series Description Raw Data in JSON format -} -\details{ -The default \code{base_url} is to "raw" JSON files stored in a GitHub repository that is regularly updated from the official source of Series Descriptions. Using format: \verb{https://raw.githubusercontent.com/ncss-tech/SoilKnowledgeBase/main/inst/extdata/OSD/\{LETTER\}/\{SERIES\}.json} -} -\examples{ - -\donttest{ -if(requireNamespace("curl") & - curl::has_internet()) { - -series <- c("Musick", "Hector", "Chewacla") -get_OSD_JSON(series) -} -} -} diff --git a/tests/testthat/test-get_OSD.R b/tests/testthat/test-get_OSD.R new file mode 100644 index 00000000..6fabb69b --- /dev/null +++ b/tests/testthat/test-get_OSD.R @@ -0,0 +1,19 @@ +test_that("get_OSD works", { + + skip_if_offline() + + skip_on_cran() + + series <- c("Musick", "Hector", NA, "Foobar", "Chewacla") + + # warning: F/FOOBAR.json does not exist + expect_warning({res1 <- get_OSD(series)}) + + # data.frame result w/ 3 existing official series + expect_equal(nrow(res1), 3) + + # same 404 messages with list result="html" output + res2 <- get_OSD(series, result = "html") + expect_equal(length(res2), 5) + expect_null(res2[[4]]) +}) diff --git a/tests/testthat/test-get_OSD_JSON.R b/tests/testthat/test-get_OSD_JSON.R deleted file mode 100644 index 52141994..00000000 --- a/tests/testthat/test-get_OSD_JSON.R +++ /dev/null @@ -1,15 +0,0 @@ -test_that("get_OSD_JSON works", { - - skip_if_offline() - - skip_on_cran() - - series <- c("Musick", "Hector", NA, "Foobar", "Chewacla") - - # warning: F/FOOBAR.json does not exist - expect_warning({res <- get_OSD_JSON(series)}) - - # data.frame result w/ 3 existing official series - expect_equal(nrow(res), 3) - -})