From 4e0c8f878b51c552be1394e6c661e023e1b46dc6 Mon Sep 17 00:00:00 2001 From: trinker Date: Thu, 2 Oct 2014 12:36:42 -0400 Subject: [PATCH] `all_words` gains `char.keep` and `char2space` arguments to enable retention of characters and multi word phrases. These features are passed to `freq_terms` as well. Suggested by stackoverflow's lawyeR (http://stackoverflow.com/a/26162401/1000343). --- NEWS | 5 +++ NEWS.md | 5 +++ R/all_words.R | 78 ++++++++++++++++++++++++++++------------------- R/freq_terms.R | 3 ++ man/all_words.Rd | 13 +++++++- man/freq_terms.Rd | 3 ++ 6 files changed, 75 insertions(+), 32 deletions(-) diff --git a/NEWS b/NEWS index 114b32d3..0ac850d5 100644 --- a/NEWS +++ b/NEWS @@ -44,6 +44,11 @@ MINOR FEATURES IMPROVEMENTS + `all_words` gains `char.keep` and `char2space` arguments to enable retention + of characters and multi word phrases. These features are passed to + `freq_terms` as well. Suggested by stackoverflow's lawyeR + (http://stackoverflow.com/a/26162401/1000343). + CHANGES * `rm_url` has been moved into its own canned regex pattern extraction/replacer diff --git a/NEWS.md b/NEWS.md index 19c000c2..5c322ebe 100644 --- a/NEWS.md +++ b/NEWS.md @@ -44,6 +44,11 @@ And constructed with the following guidelines: **IMPROVEMENTS** + `all_words` gains `char.keep` and `char2space` arguments to enable retention + of characters and multi word phrases. These features are passed to + `freq_terms` as well. Suggested by stackoverflow's lawyeR + (http://stackoverflow.com/a/26162401/1000343). + **CHANGES** * `rm_url` has been moved into its own canned regex pattern extraction/replacer diff --git a/R/all_words.R b/R/all_words.R index 38eedd9a..eca974c3 100644 --- a/R/all_words.R +++ b/R/all_words.R @@ -13,6 +13,11 @@ #' \code{FALSE} orders the rows by descending frequency. #' @param apostrophe.remove logical. If \code{TRUE} removes apostrophes from #' the text before examining. 
+#' @param char.keep A character vector of symbol character (i.e., punctuation) +#' that strip should keep. The default is to strip everything except +#' apostrophes. This enables the use of special characters to be turned into +#' spaces or for characters to be retained. +#' @param char2space A vector of characters to be turned into spaces. #' @param \ldots Other argument supplied to \code{\link[qdap]{strip}}. #' @return Returns a dataframe with frequency counts of words that begin with or #' contain the provided word chunk. @@ -39,35 +44,47 @@ #' #' ## Filter by nchar and stopwords #' Filter(head(x3), min = 3) +#' +#' ## Keep spaces +#' all_words(space_fill(DATA$state, c("are you", "can be"))) #' } all_words <- -function(text.var, begins.with = NULL, contains = NULL, alphabetical = TRUE, apostrophe.remove = FALSE, ...){ - if (!is.null(begins.with) & !is.null(contains)) { - stop("Can not use both 'begins.with' & 'contains' arguments") - } - if(!is.null(begins.with)) begins.with <- tolower(begins.with) - if(!is.null(contains)) contains <- tolower(contains) - WORDS <- unlist(bag_o_words(strip(text.var, apostrophe.remove = apostrophe.remove, ...))) - names(WORDS) <- NULL - y <- data.frame(table(WORDS), stringsAsFactors = FALSE, row.names=NULL) - names(y) <- c("WORD", "FREQ") - y$WORD <- as.character(y$WORD) - y[, "FREQ"] <- as.numeric(as.character(y[, "FREQ"])) - if (!is.null(begins.with)) { - y <- y[substring(y[, 1], 1, nchar(begins.with)) %in% begins.with, ] - if(nrow(y)==0) stop("No words match") - } - if (!is.null(contains)) { - y <- y[grep(contains, y[, 1]), ] - if(nrow(y)==0) stop("No words match") - } - if (!alphabetical) { - y <- y[order(-y$FREQ, y$WORD), ] + function(text.var, begins.with = NULL, contains = NULL, alphabetical = TRUE, + apostrophe.remove = FALSE, char.keep = char2space, char2space = "~~", ...){ + + if (!is.null(begins.with) & !is.null(contains)) { + stop("Can not use both 'begins.with' & 'contains' arguments") + } + if(!is.null(begins.with)) 
begins.with <- tolower(begins.with) + if(!is.null(contains)) contains <- tolower(contains) + + char.keep <- unique(c(char2space, char.keep)) + + WORDS <- unlist(bag_o_words(text.var, apostrophe.remove = apostrophe.remove, + char.keep, char.keep, ...), use.names=FALSE) + + y <- data.frame(table(WORDS), stringsAsFactors = FALSE, row.names=NULL) + names(y) <- c("WORD", "FREQ") + y$WORD <- as.character(y$WORD) + y[, "FREQ"] <- as.numeric(as.character(y[, "FREQ"])) + if (!is.null(begins.with)) { + y <- y[substring(y[, 1], 1, nchar(begins.with)) %in% begins.with, ] + if(nrow(y)==0) stop("No words match") + } + if (!is.null(contains)) { + y <- y[grep(contains, y[, 1]), ] + if(nrow(y)==0) stop("No words match") + } + if (!alphabetical) { + y <- y[order(-y$FREQ, y$WORD), ] + } + if (!is.null(char2space)) { + y[["WORD"]] <- mgsub(char2space, " ", y[["WORD"]]) + } + p <- class(y) + class(y) <- c("all_words", p) + y } - p <- class(y) - class(y) <- c("all_words", p) - y -} #' Prints an all_words Object #' @@ -93,12 +110,11 @@ print.all_words <- function(x, ...) { #' @method Filter all_words #' @return \code{Filter.all_words} - Returns a matrix of the class "all_words". Filter.all_words <- function(x, min = 1, max = Inf, - count.apostrophe = TRUE, stopwords = NULL, ignore.case = TRUE, ...) { - + count.apostrophe = TRUE, stopwords = NULL, ignore.case = TRUE, ...) { + word_list_filter_helper(x, min = min, max = max, - count.apostrophe = count.apostrophe, stopwords = stopwords, - ignore.case = ignore.case, ...) + count.apostrophe = count.apostrophe, stopwords = stopwords, + ignore.case = ignore.case, ...) 
} - diff --git a/R/freq_terms.R b/R/freq_terms.R index 5420bb8b..d7a508b2 100644 --- a/R/freq_terms.R +++ b/R/freq_terms.R @@ -47,6 +47,9 @@ #' plot(out[[i]], plot=FALSE) + ggtitle(names(out)[i]) #' }) #' dev.off() +#' +#' ## Keep spaces +#' freq_terms(space_fill(DATA$state, "are you"), 500, char.keep="~~") #' } freq_terms <- function(text.var, top = 20, at.least = 1, stopwords = NULL, diff --git a/man/all_words.Rd b/man/all_words.Rd index a742da02..860ea25c 100644 --- a/man/all_words.Rd +++ b/man/all_words.Rd @@ -4,7 +4,8 @@ \title{Searches Text Column for Words} \usage{ all_words(text.var, begins.with = NULL, contains = NULL, - alphabetical = TRUE, apostrophe.remove = FALSE, ...) + alphabetical = TRUE, apostrophe.remove = FALSE, char.keep = char2space, + char2space = "~~", ...) } \arguments{ \item{text.var}{The text variable.} @@ -21,6 +22,13 @@ Use this if searching for a word containing the word chunk.} \item{apostrophe.remove}{logical. If \code{TRUE} removes apostrophes from the text before examining.} +\item{char.keep}{A character vector of symbol character (i.e., punctuation) +that strip should keep. The default is to strip everything except +apostrophes. This enables the use of special characters to be turned into +spaces or for characters to be retained.} + +\item{char2space}{A vector of characters to be turned into spaces.} + \item{\ldots}{Other argument supplied to \code{\link[qdap]{strip}}.} } \value{ @@ -54,6 +62,9 @@ head(x5) ## Filter by nchar and stopwords Filter(head(x3), min = 3) + +## Keep spaces +all_words(space_fill(DATA$state, c("are you", "can be"))) } } \seealso{ diff --git a/man/freq_terms.Rd b/man/freq_terms.Rd index 1e804f47..62111363 100644 --- a/man/freq_terms.Rd +++ b/man/freq_terms.Rd @@ -59,6 +59,9 @@ lapply(seq_along(out), function(i) { plot(out[[i]], plot=FALSE) + ggtitle(names(out)[i]) }) dev.off() + +## Keep spaces +freq_terms(space_fill(DATA$state, "are you"), 500, char.keep="~~") } } \seealso{