From 6d70a71c232514f23de99d41c829b4bd63f970a6 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Fri, 2 Jul 2021 18:16:47 -0700 Subject: [PATCH] Integrate parseOSD functions --- R/create_OSD.R | 7 + .../parseOSD_functions.R | 293 +++++++++--------- 2 files changed, 153 insertions(+), 147 deletions(-) rename misc/parseOSD-migration/local_functions.R => R/parseOSD_functions.R (96%) diff --git a/R/create_OSD.R b/R/create_OSD.R index 54114e42c0..a24f85434b 100644 --- a/R/create_OSD.R +++ b/R/create_OSD.R @@ -370,8 +370,11 @@ osd_to_json <- function(logfile = file.path(output_dir, "OSD/OSD.log"), res <- sapply(1:length(all_osds), function(i) { filepath <- all_osds[[i]] logmsg(logfile, " - %s", filepath) + x <- validateOSD(logfile, filepath) + typicalpedon <- .parseTypicalPedon(x[["TYPICAL PEDON"]]) + if (is.logical(x)) if (!x) return(FALSE) @@ -389,3 +392,7 @@ osd_to_json <- function(logfile = file.path(output_dir, "OSD/OSD.log"), names(res) <- all_osds return(res) } + +.parseTypicalPedon <- function(section) { + +} \ No newline at end of file diff --git a/misc/parseOSD-migration/local_functions.R b/R/parseOSD_functions.R similarity index 96% rename from misc/parseOSD-migration/local_functions.R rename to R/parseOSD_functions.R index a6355dec52..b6ac023569 100644 --- a/misc/parseOSD-migration/local_functions.R +++ b/R/parseOSD_functions.R @@ -9,11 +9,11 @@ makeFullTextTable <- function(fullTextList, outputFile='fulltext-data.sql') { cat('DROP TABLE osd.osd_fulltext;\n', file=outputFile) cat('CREATE TABLE osd.osd_fulltext (series citext, fulltext text);\n', file=outputFile, append = TRUE) cat("set client_encoding to 'latin1' ;\n", file=outputFile, append = TRUE) - + # remove NULL elements idx <- which(! sapply(fullTextList, is.null)) fullTextList <- fullTextList[idx] - + # iterate over list elements and write to file # source text is gzip compressed n <- lapply(fullTextList, function(i) { @@ -26,7 +26,7 @@ makeFullTextTable <- function(fullTextList, outputFile='fulltext-data.sql') { # re-make sectioned fulltext table, containing an OSD per record makeFullTextSectionsTable <- function(fullTextList, outputFile='fulltext-section-data.sql') { - + # reset fulltext SQL file # need to adjust fields manually as we edit cat('DROP TABLE osd.osd_fulltext2;\n', file='fulltext-section-data.sql') @@ -48,11 +48,11 @@ established text, additional_data text );\n', file='fulltext-section-data.sql', append = TRUE) cat("set client_encoding to 'latin1' ;\n", file='fulltext-section-data.sql', append = TRUE) - + # remove NULL elements idx <- which(! sapply(fullTextList, is.null)) fullTextList <- fullTextList[idx] - + # iterate over list elements and write to file # source text is gzip compressed n <- lapply(fullTextList, function(i) { @@ -81,10 +81,10 @@ ConvertToFullTextRecord <- function(s, s.lines, tablename='osd.osd_fulltext') { ConvertToFullTextRecord2 <- function(s, s.lines, tablename='osd.osd_fulltext2') { # split sections to list, section titles hard-coded sections <- extractSections(s.lines) - + # get names of all sections st <- names(.sectionData) - + # combine sections with $$ quoting blob <- sapply(st, function(i) {paste0('$$', sections[[i]], '$$')}) res <- paste0('INSERT INTO ', tablename, ' VALUES ( $$', s, '$$, ', paste(blob, collapse = ', '), ');\n') @@ -98,70 +98,70 @@ ConvertToFullTextRecord2 <- function(s, s.lines, tablename='osd.osd_fulltext2') -downloadParseSave <- function(i) { - +downloadParseSave <- function(i, path = 'OSD') { + # get OSD from WWW pages / HTML parsing # result is a list # i.lines <- try(soilDB:::.getLiveOSD(i), silent = TRUE) - + # get from local OSDRegistry repository # result is a list i.lines <- try( - soilDB:::.getLocalOSD(i, path = 'E:/working_copies/OSDRegistry/OSD'), + soilDB:::.getLocalOSD(i, path = path), silent = TRUE ) - + # no OSD... if(class(i.lines) == 'try-error') return(FALSE) - + # output as list res <- list() - + # register section REGEX # this sets / updates a global variable setSectionREGEX(i) - - # get rendered HTML->text and save to file + + # get rendered HTML->text and save to file # store gzip-compressd OSD for bulk INSERT res[['fulltext']] <- memCompress(ConvertToFullTextRecord(i, i.lines), type='gzip') - + ## previously: # cat(i.fulltext, file = 'fulltext-data.sql', append = TRUE) - + # split data into sections for fulltext search, catch errors related to parsing sections i.sections <- try(ConvertToFullTextRecord2(i, i.lines)) if(class(i.sections) != 'try-error') { - + # store gzip-compressed sections for bulk INSERT res[['sections']] <- memCompress(i.sections, type='gzip') - + ## previously: # cat(i.sections, file = 'fulltext-section-data.sql', append = TRUE) } - - + + # append hz data to our list, catch errors related to parsing sections hz.data <- try(extractHzData(i.lines), silent = TRUE) - + # append site data to our list, catch errors related to parsing sections section.data <- try(extractSections(i.lines), silent = TRUE) site.data <- try(extractSiteData(section.data), silent = TRUE) - + # append if result was a data.frame if(class(hz.data) == 'data.frame') { # add seriesname to final column hz.data$seriesname <- i res[['hz']] <- hz.data } - + # append if result was a data.frame if(class(site.data) == 'data.frame') { # add seriesname to final column site.data$seriesname <- i res[['site']] <- site.data } - + return(res) } @@ -169,38 +169,37 @@ downloadParseSave <- function(i) { downloadParseSave.safe <- purrr::safely(downloadParseSave) - testIt <- function(x) { # get data res <- soilDB:::.getLiveOSD(x) - + # init section REGEX: critical for locating brief narrative setSectionREGEX(x) - + # extract sections l <- list() l[['sections']] <- extractSections(res) l[['section-indices']] <- findSectionIndices(res) l[['site-data']] <- extractSiteData(l[['sections']]) l[['hz-data']] <- extractHzData(res) - + return(l) } -testItLocal <- function(x, path = 'E:/working_copies/OSDRegistry/OSD') { +testItLocal <- function(x, path = 'OSD') { # get data res <- soilDB:::.getLocalOSD(x, path) - + # init section REGEX: critical for locating brief narrative setSectionREGEX(x) - + # extract sections l <- list() l[['sections']] <- extractSections(res) l[['section-indices']] <- findSectionIndices(res) l[['site-data']] <- extractSiteData(l[['sections']]) l[['hz-data']] <- extractHzData(res) - + return(l) } @@ -211,30 +210,30 @@ testItLocal <- function(x, path = 'E:/working_copies/OSDRegistry/OSD') { findClass <- function(needle, haystack) { # iterate over vector of horizon narratives, searching for exact matches test.by.hz <- lapply(haystack, stri_detect_fixed, pattern = needle, opts_fixed = list(case_insensitive=TRUE)) - - # iterate over search results by horizon, keeping names of matching classes + + # iterate over search results by horizon, keeping names of matching classes matches <- lapply(test.by.hz, function(i) { needle[i] }) - + # compute number of characters: longer matches are the most specific / correct res <- map_chr(matches, function(i) { - + # find the longest matching string idx <- which.max(nchar(i)) - + # extract it m <- i[idx] - + # convert _nothing_ into NA if(length(m) > 0) { return(m) } else { return(NA) } - + }) - + return(res) } @@ -243,64 +242,64 @@ findClass <- function(needle, haystack) { parse_texture <- function(text) { # mineral texture classes, sorted from coarse -> fine textures <- c('coarse sand', 'sand', 'fine sand', 'very fine sand', 'loamy coarse sand', 'loamy sand', 'loamy fine sand', 'loamy very fine sand', 'coarse sandy loam', 'sandy loam', 'fine sandy loam', 'very fine sandy loam', 'loam', 'silt loam', 'silt', 'sandy clay loam', 'clay loam', 'silty clay loam', 'sandy clay', 'silty clay', 'clay') - + ## TODO: this is too greedy as 'fine sand' will be found _within_ 'fine sandy loam' # https://github.com/dylanbeaudette/parse-osd/issues/10 - + # combine into capturing REGEX # texture.regex <- paste0('(', paste(textures, collapse='|'), ')') - # + # # # get matches # m <- stri_match(text, regex = texture.regex, mode='first', opts_regex=list(case_insensitive=TRUE)) - # + # # # fail gracefully in the case of no section data or no matches # if(nrow(m) < 1) # return(NA) - # + # # # keep only matches and convert to lower case # m <- tolower(m[, 2]) - + ## 2019-05-29: generalized for all non-greedy, exact matching m <- findClass(needle=textures, haystack=text) m <- tolower(m) - + # convert to ordered factor m <- factor(m, levels=textures, ordered = TRUE) - + return(m) } # vectorized parsing of horizon boundary parse_hz_boundary <- function(text) { - + distinctness <- c('very abrupt', 'abrupt', 'clear', 'gradual', 'diffuse') topography <- c('smooth', 'wavy', 'irregular', 'broken') bdy <- apply(expand.grid(distinctness, topography), 1, paste, collapse=' ') - - + + ## TODO: this is too greedy ? # https://github.com/dylanbeaudette/parse-osd/issues/10 - + # combine into capturing REGEX bdy.regex <- paste0('(', paste(bdy, collapse='|'), ') boundary') - + # get matches m <- stri_match(text, regex = bdy.regex, mode='first', opts_regex=list(case_insensitive=TRUE)) - + # fail gracefully in the case of no section data or no matches if(nrow(m) < 1) return(NA) - + # keep only matches and convert to lower case m <- tolower(m[, 2]) - + # split into pieces res <- data.frame( distinctness = findClass(needle=distinctness, haystack=m), topography = findClass(needle=topography, haystack=m), stringsAsFactors = FALSE ) - + return(res) } @@ -309,91 +308,91 @@ parse_CF <- function(text) { cf.type <- c('gravelly', 'cobbly', 'stony', 'bouldery', 'channery', 'flaggy') cf.qty <- c('', 'very ', 'extremely ') cf <- apply(expand.grid(cf.qty, cf.type), 1, paste, collapse='') - - + + ## TODO: this is too greedy as 'fine sand' will be found _within_ 'fine sandy loam' # https://github.com/dylanbeaudette/parse-osd/issues/10 - + # combine into capturing REGEX cf.regex <- paste0('(', paste(cf, collapse='|'), ')') - + # get matches m <- stri_match(text, regex = cf.regex, mode='first', opts_regex=list(case_insensitive=TRUE)) - + # fail gracefully in the case of no section data or no matches if(nrow(m) < 1) return(NA) - + # keep only matches and convert to lower case m <- tolower(m[, 2]) - + return(m) } # vectorized parsing of pH parse_pH <- function(text) { - + # combine into capturing REGEX ph.regex <- '\\(ph\\s?([0-9]\\.[0-9])\\)' - + # get matches m <- stri_match(text, regex = ph.regex, mode='first', opts_regex=list(case_insensitive=TRUE)) - + # fail gracefully in the case of no section data or no matches if(nrow(m) < 1) return(NA) - + # keep only matches m <- as.numeric(m[, 2]) - + return(m) } # vectorized parsing of pH class parse_pH_class <- function(text) { - + # mineral texture classes pH_classes <- c('ultra acid', 'extremely acid', 'very strongly acid', 'strongly acid', 'moderately acid', 'slightly acid', 'neutral', 'slightly alkaline', 'mildly alkaline', 'moderately alkaline', 'strongly alkaline', 'very strongly alkaline') - + ## 2019-05-29: generalized for all non-greedy, exact matching m <- findClass(needle=pH_classes, haystack=text) m <- tolower(m) - + # return as an ordered factor acidic -> basic m <- factor(m, levels=pH_classes, ordered = TRUE) - + return(m) } # vectorized parsing of drainage class parse_drainage_class <- function(text) { - + # drainage classes, in order, lower case - classes <- c("excessively", "somewhat excessively", "well", "moderately well", + classes <- c("excessively", "somewhat excessively", "well", "moderately well", "somewhat poorly", "poorly", "very poorly", "subaqueous") - + ## TODO: this is too greedy as 'fine sand' will be found _within_ 'fine sandy loam' # https://github.com/dylanbeaudette/parse-osd/issues/10 - - + + # combine into capturing REGEX classes.regex <- paste0('(', paste(classes, collapse='|'), ')') - + # get matches m <- stri_match(text, regex = classes.regex, mode='first', opts_regex=list(case_insensitive=TRUE)) - + # fail gracefully in the case of no section data or no matches if(nrow(m) < 1) return(NA) - + # keep only matches and convert to lower case m <- tolower(m[, 2]) - + # return as an ordered factor acidic -> basic m <- factor(m, levels=classes, ordered = TRUE) - + return(m) } @@ -409,7 +408,7 @@ setSectionREGEX <- function(s) { ## temporary hack: storing as a global variable # values are REGEX that try to accommodate typos # names are the proper section names - + ## TODO: # consider anchoring all to left-side + optional white-space # "TYPICAL PEDON" REGEX is too greedy @@ -430,7 +429,7 @@ setSectionREGEX <- function(s) { 'SERIES ESTABLISHED'='^\\s*SERIES ESTABLISHED[:]? ', 'ADDITIONAL DATA'='^\\s*ADDITIONAL DATA[:]? ' ) - + } @@ -441,17 +440,17 @@ setSectionREGEX <- function(s) { findSectionIndices <- function(chunk.lines) { # result is a list, sometimes a section REGEX will match multiple lines s <- lapply(.sectionData, function(st) grep(st, chunk.lines, ignore.case = TRUE)) - + # filter out sections with no matches match.idx <- which(sapply(s, function(i) length(i) > 0)) s <- s[match.idx] - + # keep the first match indices <- sapply(s, function(i) i[1]) - + # get set of section names with parsed data section.names <- names(indices) - + return(indices) } @@ -459,42 +458,42 @@ findSectionIndices <- function(chunk.lines) { extractSections <- function(chunk.lines, collapseLines=TRUE) { # storage l <- list() - + # locate section lines # note: this will give values inclusive of the next section section.locations <- findSectionIndices(chunk.lines) section.names <- names(section.locations) - + # combine chunks into a list for(i in 1:(length(section.locations) - 1)) { - + # current name and landmarks this.name <- section.names[i] start.line <- section.locations[i] - + # this stop line overlaps with the start of the next, decrease index by 1 stop.line <- section.locations[i+1] - 1 - + # extract current chunk chunk <- chunk.lines[start.line : stop.line] - + # special case #1: the brief narrative is split over two lines; first line is junk if(this.name == 'BRIEF DESCRIPTION' & length(chunk) > 1) { # attempt to remove when possible chunk <- chunk[-1] } - + # optionally combine lines if(collapseLines) chunk <- paste(chunk, collapse='') - + # attempt to remove section name chunk <- gsub(this.name, '', chunk) - + # store l[[this.name]] <- chunk } - + return(l) } @@ -506,22 +505,22 @@ extractSections <- function(chunk.lines, collapseLines=TRUE) { # parse important pieces from sections # x: list of section chunks extractSiteData <- function(x) { - + ## drainage class - + # this work for standard OSD format drainage.class <- parse_drainage_class(x[['DRAINAGE AND PERMEABILITY']]) - + # alternative for SSR1 updated OSD format # https://casoilresource.lawr.ucdavis.edu/sde/?series=bordengulch if(is.na(drainage.class)) { drainage.class <- parse_drainage_class(x[['BRIEF DESCRIPTION']]) } - - + + ## other things? - - + + # composite into a list for later r <- data.frame(drainage=drainage.class) return(r) @@ -529,15 +528,15 @@ extractSiteData <- function(x) { # s.lines: result of getOSD() -extractHzData <- function(s.lines) { +.extractHzData <- function(tp) { options(stringsAsFactors=FALSE) - + ## TODO: this is kind of wasteful # this will not work in the presence of typos # new code for splitting blocks by section, lines from each section are not joined - sections <- extractSections(s.lines, collapseLines = FALSE) - tp <- sections[['TYPICAL PEDON']] - + # sections <- extractSections(s.lines, collapseLines = FALSE) + # tp <- sections[['TYPICAL PEDON']] + ## REGEX rules # http://regexr.com/ ## TODO: combine top+bottom with top only rules @@ -548,64 +547,64 @@ extractHzData <- function(s.lines) { # detect horizons with both top and bottom depths # hz.rule <- "^\\s*?([\\^\\'\\/a-zA-Z0-9]+)\\s?-+?\\s?([O0-9.]+)\\s+?to\\s+?([O0-9.]+)\\s+?(in|inches|cm|centimeters)" hz.rule <- "([\\^\\'\\/a-zA-Z0-9]+)\\s*-+\\s*([O0-9.]+)\\s*?to\\s+?([O0-9.]+)\\s+?(in|inches|cm|centimeters)" - + # detect horizons with no bottom depth hz.rule.no.bottom <- "([\\^\\'\\/a-zA-Z0-9]+)\\s*-+?\\s*([0-9.]+)\\s+?(in|inches|cm|centimeters)" - - + + ## TODO: this doesn't work when only moist colors are specified (http://casoilresource.lawr.ucdavis.edu/sde/?series=canarsie) ## TODO: these rules will not match neutral colors: N 2.5/ ## TODO: toggle dry/moist assumption: ## ## Colors are for dry soil unless otherwise stated | Colors are for moist soil unless otherwise stated - ## + ## ## E1--7 to 12 inches; very dark gray (10YR 3/1) silt loam, 50 percent gray (10YR 5/1) and 50 percent gray (10YR 6/1) dry; moderate thin platy structure parting to weak thin platy; friable, soft; common fine and medium roots throughout; common fine tubular pores; few fine distinct dark yellowish brown (10YR 4/6) friable masses of iron accumulations with sharp boundaries on faces of peds; strongly acid; clear wavy boundary. - + ## A--0 to 6 inches; light gray (10YR 7/2) loam, dark grayish brown (10YR 4/2) moist; moderate coarse subangular blocky structure; slightly hard, friable, slightly sticky and slightly plastic; many very fine roots; many very fine and few fine tubular and many very fine interstitial pores; 10 percent pebbles; strongly acid (pH 5.1); clear wavy boundary. (1 to 8 inches thick) ## - + ## TODO: test this # establist default encoding of colors dry.is.default <- length(grep('for dry (soil|conditions)', tp, ignore.case = TRUE)) > 0 moist.is.default <- length(grep('for moist (soil|conditions)', tp, ignore.case = TRUE)) > 0 - + if(dry.is.default) default.moisture.state <- 'dry' if(moist.is.default) default.moisture.state <- 'moist' - + # if neither are specified assume moist conditions if((!dry.is.default & !moist.is.default)) default.moisture.state <- 'moist' - + # if both are specified (?) if(dry.is.default & moist.is.default) default.moisture.state <- 'unknown' - + ## TODO: test this # get all colors matching our rule, moist and dry and unknown, 5th column is moisture state # interpretation is tough when multiple colors / hz are given # single rule, with dry/moist state # note that dry/moist may not always be present color.rule <- "\\(([0-9]?[\\.]?[0-9]?[B|G|Y|R|N]+)([ ]+?[0-9\\.]+)/([0-9])\\)\\s?(dry|moist|)" - + # detect moist and dry colors dry.color.rule <- "\\(([0-9]?[\\.]?[0-9]?[B|G|Y|R|N]+)([ ]+?[0-9\\.]+)/([0-9])\\)(?! moist)" moist.color.rule <- "\\(([0-9]?[\\.]?[0-9]?[B|G|Y|R|N]+)([ ]+?[0-9\\.]+)/([0-9])\\) moist" - + # ID actual lines of horizon information hz.idx <- unique(c(grep(hz.rule, tp), grep(hz.rule.no.bottom, tp))) - + # init empty lists to store hz data and colors hz.data <- list() dry.colors <- list() moist.colors <- list() narrative.data <- list() - + # iterate over identified horizons, extracting hz parts for(i in seq_along(hz.idx)) { this.chunk <- tp[hz.idx[i]] - + # parse hz designations and depths, keep first match ## hack # first try to find horizons with top AND bottom depths @@ -617,86 +616,86 @@ extractHzData <- function(s.lines) { h <- c(h, h[4]) # move units to 5th element h[4] <- NA # add fake missing bottom depth } - + # save hz data to list hz.data[[i]] <- h - + # save narrative to list narrative.data[[i]] <- this.chunk - + ## TODO: test this! # parse ALL colors, result is a multi-row matrix, 5th column is moisture state colors <- stri_match_all(this.chunk, regex=color.rule)[[1]] # replace missing moisture state with (parsed) default value colors[, 5][which(colors[, 5] == '')] <- default.moisture.state - + # exctract dry|moist colors, note that there may be >1 color per state dc <- colors[which(colors[, 5] == 'dry'), 1:4, drop=FALSE] mc <- colors[which(colors[, 5] == 'moist'), 1:4, drop=FALSE] - + # there there was at least 1 match, keep the first 1 if(nrow(dc) > 0) dry.colors[[i]] <- dc[1, ] else dry.colors[[i]] <- matrix(rep(NA, times=4), nrow = 1) - + if(nrow(mc) > 0) moist.colors[[i]] <- mc[1, ] else moist.colors[[i]] <- matrix(rep(NA, times=4), nrow = 1) } - + # test for no parsed data, must be some funky formatting... if(length(hz.data) == 0) return(NULL) - + # convert to DF hz.data <- ldply(hz.data)[2:5] dry.colors <- ldply(dry.colors)[2:4] moist.colors <- ldply(moist.colors)[2:4] narrative.data <- ldply(narrative.data) - + names(hz.data) <- c('name', 'top', 'bottom', 'units') names(dry.colors) <- c('dry_hue', 'dry_value', 'dry_chroma') names(moist.colors) <- c('moist_hue', 'moist_value', 'moist_chroma') names(narrative.data) <- c('narrative') - + # cast to proper data types hz.data$top <- as.numeric(hz.data$top) hz.data$bottom <- as.numeric(hz.data$bottom) - + dry.colors$dry_value <- as.numeric(dry.colors$dry_value) dry.colors$dry_chroma <- as.numeric(dry.colors$dry_chroma) - + moist.colors$moist_value <- as.numeric(moist.colors$moist_value) moist.colors$moist_chroma <- as.numeric(moist.colors$moist_chroma) - + ## TODO sanity check / unit reporting: this will fail when formatting is inconsistent (PROPER series) # convert in -> cm using the first horizon if(hz.data$units[1] %in% c('inches', 'in')) { hz.data$top <- round(hz.data$top * 2.54) hz.data$bottom <- round(hz.data$bottom * 2.54) } - + # remove units column hz.data$units <- NULL - + # combine into single DF res <- cbind(hz.data, dry.colors, moist.colors) - + # parse out other elements from the narrative res$texture_class <- parse_texture(narrative.data$narrative) res$cf_class <- parse_CF(narrative.data$narrative) res$pH <- parse_pH(narrative.data$narrative) res$pH_class <- parse_pH_class(narrative.data$narrative) - + bdy <- parse_hz_boundary(narrative.data$narrative) res$distinctness <- bdy$distinctness res$topography <- bdy$topography - + # add narrative res <- cbind(res, narrative.data) - + return(res) }