Skip to content

Commit

Permalink
updated fulltext search source data to use get_OSD output
Browse files Browse the repository at this point in the history
  • Loading branch information
dylanbeaudette committed Feb 15, 2022
1 parent b201838 commit 7dc67f1
Show file tree
Hide file tree
Showing 3 changed files with 177 additions and 61 deletions.
50 changes: 46 additions & 4 deletions misc/parseOSD-dev/create-SoilWeb-OSD-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ library(soilDB)
library(data.table)
library(progress)


## temporary parking place for full text processing functions
source('fulltext-to-merge.R')


# recent SC database
sc <- fread('https://github.com/ncss-tech/SoilWeb-data/raw/main/files/SC-database.csv.gz')
sc <- as.data.frame(sc)
Expand Down Expand Up @@ -33,6 +38,11 @@ idx <- 1:length(sc)
hz.data <- list()
site.data <- list()
missing.file <- list()
fulltext.records <- list()

# section names we will be extracting for SoilWeb / NASIS
section.names <- c("OVERVIEW", "TAXONOMIC.CLASS", "TYPICAL.PEDON", "TYPE.LOCATION", "RANGE.IN.CHARACTERISTICS", "COMPETING.SERIES", "GEOGRAPHIC.SETTING", "GEOGRAPHICALLY.ASSOCIATED.SOILS", "DRAINAGE.AND.PERMEABILITY", "USE.AND.VEGETATION", "DISTRIBUTION.AND.EXTENT", "REMARKS", "ORIGIN", "ADDITIONAL.DATA")


## TODO: convert this to furrr / parallel processing

Expand All @@ -51,6 +61,8 @@ for(i in sc[idx]) {
# * some series in SC may not exist here
# * these files may contain data.frames of varying structure
osddf <- get_OSD(i, result = 'json', base_url = osd.path)

# typical pedon section, already broken into pieces
hz <- osddf[['HORIZONS']][[1]]
s <- osddf[['SITE']][[1]]

Expand All @@ -60,6 +72,7 @@ for(i in sc[idx]) {
next
}

## horizon data
# file exists, but perhaps nothing was extracted... why?
if(inherits(hz, 'data.frame')) {
if(nrow(hz) > 0) {
Expand All @@ -71,6 +84,7 @@ for(i in sc[idx]) {
}
}

## site data
# columns should contain: drainage, drainage_overview, id
if(inherits(s, 'data.frame')) {
# remove 'id' column
Expand All @@ -90,19 +104,43 @@ for(i in sc[idx]) {
site.data[[i]] <- s
}

## attempt narrative chunks
.narratives <- list()
for(sn in section.names) {
.text <- osddf[[sn]]

# remove section title, not present in all sections
.text <- gsub(pattern = "^[a-zA-Z1-9 ]+\\s?:\\s*", replacement = "", x = .text)

# convert NA -> ''

# pack into a list for later
.narratives[[sn]] <- .text
}

## store the section fulltext INSERT statements
# with compression: 61 MB
# without compression: 232 MB
fulltext.records[[i]] <- memCompress(
.ConvertToFullTextRecord2(s = i, narrativeList = .narratives),
type = 'gzip'
)

}


pb$terminate()





## flatten
# missing files: likely old / retired OSDs
missing.file <- as.vector(do.call('c', missing.file))
length(missing.file)

# horizon data: may not share the same column-ordering
# issue: https://github.com/ncss-tech/SoilKnowledgeBase/issues/35
## horizon data: may not share the same column-ordering
hz <- as.data.frame(rbindlist(hz.data, fill = TRUE))

# re-order
Expand All @@ -113,14 +151,18 @@ vars <- c("name", "top", "bottom", "dry_hue", "dry_value", "dry_chroma",

hz <- hz[, vars]

# site data, all items should be conformal
## site data, all items should be conformal
s <- as.data.frame(rbindlist(site.data))


# save
## save hz + site data
write.csv(hz, file = gzfile('parsed-data.csv.gz'), row.names = FALSE)
write.csv(s, file = gzfile('parsed-site-data.csv.gz'), row.names = FALSE)

## re-make section fulltext table + INSERT statements
system.time(.makeFullTextSectionsTable(fulltext.records))


## TODO: investigate missing records, relative to the last time this was run
# nrow(read.csv('E:/working_copies/parse-osd/R/parsed-data.csv.gz'))
# nrow(read.csv('E:/working_copies/parse-osd/R/parsed-site-data.csv.gz'))
Expand Down
158 changes: 101 additions & 57 deletions misc/parseOSD-dev/fulltext-to-merge.R
Original file line number Diff line number Diff line change
@@ -1,55 +1,34 @@



## expected sections and ordering

# # get rendered HTML->text and save to file
# # store gzip-compressd OSD for bulk INSERT
# res[['fulltext']] <- memCompress(.ConvertToFullTextRecord(i, i.lines), type='gzip')
#
# ## previously:
# # cat(i.fulltext, file = 'fulltext-data.sql', append = TRUE)
#
# # split data into sections for fulltext search, catch errors related to parsing sections
# i.sections <- try(.ConvertToFullTextRecord2(i, i.lines))
# if(class(i.sections) != 'try-error') {
#
# # store gzip-compressed sections for bulk INSERT
# res[['sections']] <- memCompress(i.sections, type='gzip')
#
# ## previously:
# # cat(i.sections, file = 'fulltext-section-data.sql', append = TRUE)
## mapping: get_OSD() --------> SoilWeb fulltext search

# OVERVIEW brief_narrative text
# TAXONOMIC.CLASS taxonomic_class text
# TYPICAL.PEDON typical_pedon text
# TYPE.LOCATION type_location text
# RANGE.IN.CHARACTERISTICS ric text
# COMPETING.SERIES competing_series text
# GEOGRAPHIC.SETTING geog_location text
# GEOGRAPHICALLY.ASSOCIATED.SOILS geog_assoc_soils text
# DRAINAGE.AND.PERMEABILITY drainage text
# USE.AND.VEGETATION use_and_veg text
# DISTRIBUTION.AND.EXTENT distribution text
# REMARKS remarks text
# ORIGIN established text
# ADDITIONAL.DATA additional_data text



# # re-make entire fulltext table, containing an OSD per record
.makeFullTextTable <- function(fullTextList, outputFile='fulltext-data.sql') {
# reset fulltext SQL file
cat('DROP TABLE osd.osd_fulltext;\n', file = outputFile)
cat('CREATE TABLE osd.osd_fulltext (series citext, fulltext text);\n',
file = outputFile, append = TRUE)
cat("set client_encoding to 'latin1' ;\n", file = outputFile, append = TRUE)

# remove NULL elements
idx <- which(! sapply(fullTextList, is.null))
fullTextList <- fullTextList[idx]

# iterate over list elements and write to file
# source text is gzip compressed
n <- lapply(fullTextList, function(i) {
# decompress text on the fly
txt <- memDecompress(i, type = 'gzip', asChar = TRUE)
cat(txt, file = outputFile, append = TRUE)
})
}


# # re-make sectioned fulltext table, containing an OSD per record
.makeFullTextSectionsTable <- function(fullTextList, outputFile='fulltext-section-data.sql') {

# reset fulltext SQL file
# need to adjust fields manually as we edit
cat('DROP TABLE osd.osd_fulltext2;\n', file='fulltext-section-data.sql')
cat('DROP TABLE osd.osd_fulltext2;\n', file = 'fulltext-section-data.sql')

cat('CREATE TABLE osd.osd_fulltext2 (
series citext,
brief_narrative text,
Expand All @@ -67,10 +46,70 @@
established text,
additional_data text
);\n', file='fulltext-section-data.sql', append = TRUE)

cat("set client_encoding to 'latin1' ;\n", file = 'fulltext-section-data.sql', append = TRUE)

## 2022-02-15: there are no NULL elements
## remove NULL elements
# idx <- which(!sapply(fullTextList, is.null))
# fullTextList <- fullTextList[idx]

## TODO: iterate with for-loop + progress

# iterate over list elements and write to file
# source text is gzip compressed
n <- lapply(fullTextList, function(i) {
# decompress text on the fly
txt <- memDecompress(i, type = 'gzip', asChar = TRUE)
cat(txt, file = outputFile, append = TRUE)
})

}


## expected sections and ordering

# c("OVERVIEW", "TAXONOMIC.CLASS", "TYPICAL.PEDON", "TYPE.LOCATION", "RANGE.IN.CHARACTERISTICS", "COMPETING.SERIES", "GEOGRAPHIC.SETTING", "GEOGRAPHICALLY.ASSOCIATED.SOILS", "DRAINAGE.AND.PERMEABILITY", "USE.AND.VEGETATION", "DISTRIBUTION.AND.EXTENT", "ADDITIONAL.DATA")

# convert HTML text to an insert statement with data split by section (columns)
.ConvertToFullTextRecord2 <- function(s, narrativeList, tablename = 'osd.osd_fulltext2') {

# combine sections with $$ quoting
blob <- sapply(narrativeList, function(i) {
# convert NA -> ''
i <- ifelse(is.na(i), '', i)
# safely quote
paste0('$$', i, '$$')
})

# each series is collapsed into a single INSERT statement
res <- paste0('INSERT INTO ', tablename, ' VALUES ( $$',
s, '$$, ', paste(blob, collapse = ', '), ');\n')

return(res)
}










## TODO: this will require a different approach

# # re-make entire fulltext table, containing an OSD per record
.makeFullTextTable <- function(fullTextList, outputFile='fulltext-data.sql') {
# reset fulltext SQL file
cat('DROP TABLE osd.osd_fulltext;\n', file = outputFile)
cat('CREATE TABLE osd.osd_fulltext (series citext, fulltext text);\n',
file = outputFile, append = TRUE)
cat("set client_encoding to 'latin1' ;\n", file = outputFile, append = TRUE)

# remove NULL elements
idx <- which(!sapply(fullTextList, is.null))
idx <- which(! sapply(fullTextList, is.null))
fullTextList <- fullTextList[idx]

# iterate over list elements and write to file
Expand All @@ -94,21 +133,26 @@
return(res)
}

# # convert HTML text to an insert statement with data split by section
.ConvertToFullTextRecord2 <- function(s, s.lines, tablename='osd.osd_fulltext2') {
# TODO: convert to use SKB sections
# split sections to list, section titles hard-coded
sections <- .extractSections(s.lines)

# get names of all sections
st <- names(.sectionData)

# combine sections with $$ quoting
blob <- sapply(st, function(i) {
paste0('$$', sections[[i]], '$$')
})
res <- paste0('INSERT INTO ', tablename, ' VALUES ( $$',
s, '$$, ', paste(blob, collapse = ', '), ');\n')
return(res)
}




# # get rendered HTML->text and save to file
# # store gzip-compressd OSD for bulk INSERT
# res[['fulltext']] <- memCompress(.ConvertToFullTextRecord(i, i.lines), type='gzip')
#
# ## previously:
# # cat(i.fulltext, file = 'fulltext-data.sql', append = TRUE)
#
# # split data into sections for fulltext search, catch errors related to parsing sections
# i.sections <- try(.ConvertToFullTextRecord2(i, i.lines))
# if(class(i.sections) != 'try-error') {
#
# # store gzip-compressed sections for bulk INSERT
# res[['sections']] <- memCompress(i.sections, type='gzip')
#
# ## previously:
# # cat(i.sections, file = 'fulltext-section-data.sql', append = TRUE)



30 changes: 30 additions & 0 deletions misc/parseOSD-dev/prepare-NASIS-OSD-sections.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
library(soilDB)


x <- get_OSD('peters')

# remove section title, not present in all sections
cleanSectionTitle <- function(s) {
.text <- gsub(pattern = "^[a-zA-Z1-9 ]+\\s?:\\s*", replacement = "", x = s)
return(.text)
}

# section chunks
chunkNames <- c("OVERVIEW",
"TAXONOMIC.CLASS", "TYPICAL.PEDON", "TYPE.LOCATION", "RANGE.IN.CHARACTERISTICS",
"COMPETING.SERIES", "GEOGRAPHIC.SETTING", "GEOGRAPHICALLY.ASSOCIATED.SOILS",
"DRAINAGE.AND.PERMEABILITY", "USE.AND.VEGETATION", "DISTRIBUTION.AND.EXTENT",
"REGIONAL.OFFICE", "ORIGIN", "REMARKS", "ADDITIONAL.DATA")

# iterate over subset of section names, just those we intend to insert into the OSD table
x.clean <- lapply(chunkNames, function(i) {
# remove section title if present
res <- cleanSectionTitle(x[[i]])
return(res)
})

# reset names
names(x.clean) <- chunkNames



0 comments on commit 7dc67f1

Please sign in to comment.