From 18fc814e500584004778749b84afb48d7e5fcce7 Mon Sep 17 00:00:00 2001 From: "Andrew G. Brown" Date: Wed, 14 Apr 2021 07:33:59 -0700 Subject: [PATCH] Trying tokenizing for identifying similarities in criteria --- misc/formative-elements/fill-missing-taxa.R | 66 +++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 misc/formative-elements/fill-missing-taxa.R diff --git a/misc/formative-elements/fill-missing-taxa.R b/misc/formative-elements/fill-missing-taxa.R new file mode 100644 index 0000000..5919541 --- /dev/null +++ b/misc/formative-elements/fill-missing-taxa.R @@ -0,0 +1,66 @@ +library(dplyr, warn=FALSE) +library(tokenizers) + +scu <- read.csv("~/workspace/SoilTaxonomy/misc/formative-elements/subgroup_criteria_unexplained.csv", stringsAsFactors = FALSE) + +scut <- scu %>% + group_by(element, taxon) %>% + summarize(element = unique(element), + content = paste0(content, collapse = "\n")) + +scut$content_tokens <- tokenizers::tokenize_words(scut$content) + +fmt_colormatch <- lapply(split(scut, f = scut$element), function(scusub) { + + nidx <- 1:nrow(scusub) + + # create token comparison sets; n taxa in a element group get compared to n - 1 non-self criteria + res1 <- lapply(nidx, function(i) + lapply(nidx[nidx != i], function(j) { + tokens <- scusub[i, 'content_tokens'][[1]][[1]] + nonselftokens <- do.call('c', scusub[j, 'content_tokens'][[1]]) + tokens[tokens %in% nonselftokens] + })) + + # take the shortest (most restrictive) set of matches + res2 <- lapply(res1, function(x) x[[which.min(sapply(x, length))[1]]]) + + res3 <- lapply(nidx, function(i) { + foo <- gsub("\xbd|1 /2","1/2", gsub(" "," ",scusub$content[i])) + foo2 <- NULL + try(foo2 <- tolower(foo[1]), silent = TRUE) + if (!is.null(foo2)) { + foo2 <- foo + } else { + cat("### ",unique(scusub$element),"\n\n") + } + if(is.null(res2[[i]])) return(NULL) + res <- as.list(apply(stringr::str_locate(foo2[1], res2[[i]]), 1, + function(x) + if (!is.na(x[1]))# && x[2] - x[1] > 0) + list( + old = substr(foo, x[1], x[2]), + new = crayon::red(substr(foo, x[1], x[2]))) + )) + if(length(res)) + for(j in 1:length(res)) { + re <- res[[j]] + if(!is.null(re)) + foo <- gsub(paste0("\\b",re$old,"\\b"), re$new, foo) + } + cat(foo) + cat("\n\n") + foo + }) +}) + +foo <- lapply(seq_along(fmt_colormatch), function(i) {cat("## ", names(fmt_colormatch)[i], "\n\n") + sapply(fmt_colormatch[[i]], cat, "\n\n")}) + + +a_list <- as.list(rep("",93)) +names(a_list) <- names(fmt_colormatch) +a_list[["vitrixerandic"]] <- "xeric SMR with significant amount of volcanic glass particles" +a_list[["xereptic"]] <- "duripan strongly or less cemented, SMR borders on xeric" +a_list[["xerertic"]] <- "xeric SMR with cracks 5mm or more wide and slickensides or wedge-shaped peds" +a_list[["xerollic"]] <- "xeric SMR with color and base saturation of mollic epipedon"