updated fulltext search source data to use get_OSD output

ncss-tech · Feb 15, 2022 · 7dc67f1 · 7dc67f1
1 parent b201838
commit 7dc67f1
Show file tree

Hide file tree

Showing 3 changed files with 177 additions and 61 deletions.
diff --git a/misc/parseOSD-dev/create-SoilWeb-OSD-data.R b/misc/parseOSD-dev/create-SoilWeb-OSD-data.R
@@ -5,6 +5,11 @@ library(soilDB)
 library(data.table)
 library(progress)
 
+
+## temporary parking place for full text processing functions
+source('fulltext-to-merge.R')
+
+
 # recent SC database
 sc <- fread('https://github.com/ncss-tech/SoilWeb-data/raw/main/files/SC-database.csv.gz')
 sc <- as.data.frame(sc)
@@ -33,6 +38,11 @@ idx <- 1:length(sc)
 hz.data <- list()
 site.data <- list()
 missing.file <- list()
+fulltext.records <- list()
+
+# section names we will be extracting for SoilWeb / NASIS
+section.names <- c("OVERVIEW", "TAXONOMIC.CLASS", "TYPICAL.PEDON", "TYPE.LOCATION", "RANGE.IN.CHARACTERISTICS", "COMPETING.SERIES", "GEOGRAPHIC.SETTING", "GEOGRAPHICALLY.ASSOCIATED.SOILS", "DRAINAGE.AND.PERMEABILITY", "USE.AND.VEGETATION", "DISTRIBUTION.AND.EXTENT", "REMARKS", "ORIGIN", "ADDITIONAL.DATA")
+
 
 ## TODO: convert this to furrr / parallel processing
 
@@ -51,6 +61,8 @@ for(i in sc[idx]) {
   # * some series in SC may not exist here
   # * these files may contain data.frames of varying structure
   osddf <- get_OSD(i, result = 'json', base_url = osd.path)
+
+  # typical pedon section, already broken into pieces
   hz <- osddf[['HORIZONS']][[1]]
   s <- osddf[['SITE']][[1]]
 
@@ -60,6 +72,7 @@ for(i in sc[idx]) {
     next
   }
 
+  ## horizon data
   # file exists, but perhaps nothing was extracted... why?
   if(inherits(hz, 'data.frame')) {
     if(nrow(hz) > 0) {
@@ -71,6 +84,7 @@ for(i in sc[idx]) {
     }
   }
 
+  ## site data
   # columns should contain: drainage, drainage_overview, id
   if(inherits(s, 'data.frame')) {
     # remove 'id' column
@@ -90,19 +104,43 @@ for(i in sc[idx]) {
     site.data[[i]] <- s
   }
 
+  ## attempt narrative chunks
+  .narratives <- list()
+  for(sn in section.names) {
+    .text <- osddf[[sn]]
+
+    # remove section title, not present in all sections
+    .text <- gsub(pattern = "^[a-zA-Z1-9 ]+\\s?:\\s*", replacement = "", x = .text)
+
+    # convert NA -> ''
+
+    # pack into a list for later
+    .narratives[[sn]] <- .text
+  }
+
+  ## store the section fulltext INSERT statements
+  # with compression: 61 MB
+  # without compression: 232 MB
+  fulltext.records[[i]] <- memCompress(
+    .ConvertToFullTextRecord2(s = i, narrativeList = .narratives),
+    type = 'gzip'
+  )
+
 }
 
 
 pb$terminate()
 
 
+
+
+
 ## flatten
 # missing files: likely old / retired OSDs
 missing.file <- as.vector(do.call('c', missing.file))
 length(missing.file)
 
-# horizon data: may not share the same column-ordering
-# issue: https://github.com/ncss-tech/SoilKnowledgeBase/issues/35
+## horizon data: may not share the same column-ordering
 hz <- as.data.frame(rbindlist(hz.data, fill = TRUE))
 
 # re-order
@@ -113,14 +151,18 @@ vars <- c("name", "top", "bottom", "dry_hue", "dry_value", "dry_chroma",
 
 hz <- hz[, vars]
 
-# site data, all items should be conformal
+## site data, all items should be conformal
 s <- as.data.frame(rbindlist(site.data))
 
 
-# save
+## save hz + site data
 write.csv(hz, file = gzfile('parsed-data.csv.gz'), row.names = FALSE)
 write.csv(s, file = gzfile('parsed-site-data.csv.gz'), row.names = FALSE)
 
+## re-make section fulltext table + INSERT statements
+system.time(.makeFullTextSectionsTable(fulltext.records))
+
+
 ## TODO: investigate missing records, relative to the last time this was run
 # nrow(read.csv('E:/working_copies/parse-osd/R/parsed-data.csv.gz'))
 # nrow(read.csv('E:/working_copies/parse-osd/R/parsed-site-data.csv.gz'))

diff --git a/misc/parseOSD-dev/fulltext-to-merge.R b/misc/parseOSD-dev/fulltext-to-merge.R
@@ -1,55 +1,34 @@
 
 
 
+## expected sections and ordering
 
-#   # get rendered HTML->text and save to file
-#   # store gzip-compressd OSD for bulk INSERT
-#   res[['fulltext']] <- memCompress(.ConvertToFullTextRecord(i, i.lines), type='gzip')
-#
-#   ## previously:
-#   # cat(i.fulltext, file = 'fulltext-data.sql', append = TRUE)
-#
-#   # split data into sections for fulltext search, catch errors related to parsing sections
-#   i.sections <- try(.ConvertToFullTextRecord2(i, i.lines))
-#   if(class(i.sections) != 'try-error') {
-#
-#     # store gzip-compressed sections for bulk INSERT
-#     res[['sections']] <- memCompress(i.sections, type='gzip')
-#
-#     ## previously:
-#     # cat(i.sections, file = 'fulltext-section-data.sql', append = TRUE)
+## mapping: get_OSD()  -------->  SoilWeb fulltext search
 
+# OVERVIEW                           brief_narrative text
+# TAXONOMIC.CLASS                    taxonomic_class text
+# TYPICAL.PEDON                      typical_pedon text
+# TYPE.LOCATION                      type_location text
+# RANGE.IN.CHARACTERISTICS           ric text
+# COMPETING.SERIES                   competing_series text
+# GEOGRAPHIC.SETTING                 geog_location text
+# GEOGRAPHICALLY.ASSOCIATED.SOILS    geog_assoc_soils text
+# DRAINAGE.AND.PERMEABILITY          drainage text
+# USE.AND.VEGETATION                 use_and_veg text
+# DISTRIBUTION.AND.EXTENT            distribution text
+# REMARKS                            remarks text
+# ORIGIN                             established text
+# ADDITIONAL.DATA                    additional_data text
 
 
 
-# # re-make entire fulltext table, containing an OSD per record
-.makeFullTextTable <- function(fullTextList, outputFile='fulltext-data.sql') {
-  # reset fulltext SQL file
-  cat('DROP TABLE osd.osd_fulltext;\n', file = outputFile)
-  cat('CREATE TABLE osd.osd_fulltext (series citext, fulltext text);\n',
-      file = outputFile, append = TRUE)
-  cat("set client_encoding to 'latin1' ;\n", file = outputFile, append = TRUE)
-
-  # remove NULL elements
-  idx <- which(! sapply(fullTextList, is.null))
-  fullTextList <- fullTextList[idx]
-
-  # iterate over list elements and write to file
-  # source text is gzip compressed
-  n <- lapply(fullTextList, function(i) {
-    # decompress text on the fly
-    txt <- memDecompress(i, type = 'gzip', asChar = TRUE)
-    cat(txt, file = outputFile, append = TRUE)
-  })
-}
-
-
 # # re-make sectioned fulltext table, containing an OSD per record
 .makeFullTextSectionsTable <- function(fullTextList, outputFile='fulltext-section-data.sql') {
 
   # reset fulltext SQL file
   # need to adjust fields manually as we edit
-  cat('DROP TABLE osd.osd_fulltext2;\n', file='fulltext-section-data.sql')
+  cat('DROP TABLE osd.osd_fulltext2;\n', file = 'fulltext-section-data.sql')
+
   cat('CREATE TABLE osd.osd_fulltext2 (
         series citext,
         brief_narrative text,
@@ -67,10 +46,70 @@
         established text,
         additional_data text
       );\n', file='fulltext-section-data.sql', append = TRUE)
+
   cat("set client_encoding to 'latin1' ;\n", file = 'fulltext-section-data.sql', append = TRUE)
 
+  ## 2022-02-15: there are no NULL elements
+  ## remove NULL elements
+  # idx <- which(!sapply(fullTextList, is.null))
+  # fullTextList <- fullTextList[idx]
+
+  ## TODO: iterate with for-loop + progress
+
+  # iterate over list elements and write to file
+  # source text is gzip compressed
+  n <- lapply(fullTextList, function(i) {
+    # decompress text on the fly
+    txt <- memDecompress(i, type = 'gzip', asChar = TRUE)
+    cat(txt, file = outputFile, append = TRUE)
+  })
+
+}
+
+
+## expected sections and ordering
+
+# c("OVERVIEW", "TAXONOMIC.CLASS", "TYPICAL.PEDON", "TYPE.LOCATION", "RANGE.IN.CHARACTERISTICS", "COMPETING.SERIES", "GEOGRAPHIC.SETTING", "GEOGRAPHICALLY.ASSOCIATED.SOILS", "DRAINAGE.AND.PERMEABILITY", "USE.AND.VEGETATION", "DISTRIBUTION.AND.EXTENT", "ADDITIONAL.DATA")
+
+# convert HTML text to an insert statement with data split by section (columns)
+.ConvertToFullTextRecord2 <- function(s, narrativeList, tablename = 'osd.osd_fulltext2') {
+
+  # combine sections with $$ quoting
+  blob <- sapply(narrativeList, function(i) {
+    # convert NA -> ''
+    i <- ifelse(is.na(i), '', i)
+    # safely quote
+    paste0('$$', i, '$$')
+  })
+
+  # each series is collapsed into a single INSERT statement
+  res <- paste0('INSERT INTO ', tablename,  ' VALUES ( $$',
+                s, '$$, ', paste(blob, collapse = ', '), ');\n')
+
+  return(res)
+}
+
+
+
+
+
+
+
+
+
+
+## TODO: this will require a different approach
+
+# # re-make entire fulltext table, containing an OSD per record
+.makeFullTextTable <- function(fullTextList, outputFile='fulltext-data.sql') {
+  # reset fulltext SQL file
+  cat('DROP TABLE osd.osd_fulltext;\n', file = outputFile)
+  cat('CREATE TABLE osd.osd_fulltext (series citext, fulltext text);\n',
+      file = outputFile, append = TRUE)
+  cat("set client_encoding to 'latin1' ;\n", file = outputFile, append = TRUE)
+
   # remove NULL elements
-  idx <- which(!sapply(fullTextList, is.null))
+  idx <- which(! sapply(fullTextList, is.null))
   fullTextList <- fullTextList[idx]
 
   # iterate over list elements and write to file
@@ -94,21 +133,26 @@
   return(res)
 }
 
-# # convert HTML text to an insert statement with data split by section
-.ConvertToFullTextRecord2 <- function(s, s.lines, tablename='osd.osd_fulltext2') {
-  # TODO: convert to use SKB sections
-  # split sections to list, section titles hard-coded
-  sections <- .extractSections(s.lines)
-
-  # get names of all sections
-  st <- names(.sectionData)
-
-  # combine sections with $$ quoting
-  blob <- sapply(st, function(i) {
-    paste0('$$', sections[[i]], '$$')
-  })
-  res <- paste0('INSERT INTO ', tablename,  ' VALUES ( $$',
-                s, '$$, ', paste(blob, collapse = ', '), ');\n')
-  return(res)
-}
+
+
+
+
+#   # get rendered HTML->text and save to file
+#   # store gzip-compressd OSD for bulk INSERT
+#   res[['fulltext']] <- memCompress(.ConvertToFullTextRecord(i, i.lines), type='gzip')
+#
+#   ## previously:
+#   # cat(i.fulltext, file = 'fulltext-data.sql', append = TRUE)
+#
+#   # split data into sections for fulltext search, catch errors related to parsing sections
+#   i.sections <- try(.ConvertToFullTextRecord2(i, i.lines))
+#   if(class(i.sections) != 'try-error') {
+#
+#     # store gzip-compressed sections for bulk INSERT
+#     res[['sections']] <- memCompress(i.sections, type='gzip')
+#
+#     ## previously:
+#     # cat(i.sections, file = 'fulltext-section-data.sql', append = TRUE)
+
+
 
diff --git a/misc/parseOSD-dev/prepare-NASIS-OSD-sections.R b/misc/parseOSD-dev/prepare-NASIS-OSD-sections.R
@@ -0,0 +1,30 @@
+library(soilDB)
+
+
+x <- get_OSD('peters')
+
+# remove section title, not present in all sections
+cleanSectionTitle <- function(s) {
+  .text <- gsub(pattern = "^[a-zA-Z1-9 ]+\\s?:\\s*", replacement = "", x = s)
+  return(.text)
+}
+
+# section chunks
+chunkNames <- c("OVERVIEW", 
+       "TAXONOMIC.CLASS", "TYPICAL.PEDON", "TYPE.LOCATION", "RANGE.IN.CHARACTERISTICS", 
+       "COMPETING.SERIES", "GEOGRAPHIC.SETTING", "GEOGRAPHICALLY.ASSOCIATED.SOILS", 
+       "DRAINAGE.AND.PERMEABILITY", "USE.AND.VEGETATION", "DISTRIBUTION.AND.EXTENT", 
+       "REGIONAL.OFFICE", "ORIGIN", "REMARKS", "ADDITIONAL.DATA")
+
+# iterate over subset of section names, just those we intend to insert into the OSD table
+x.clean <- lapply(chunkNames, function(i) {
+  # remove section title if present
+  res <- cleanSectionTitle(x[[i]])
+  return(res)
+})
+
+# reset names
+names(x.clean) <- chunkNames
+
+
+