From 0fe9d1c08a555adb51eaff0948e5e229fcac04d2 Mon Sep 17 00:00:00 2001 From: chainsawriot Date: Wed, 22 Nov 2023 11:22:21 +0100 Subject: [PATCH] Make `remove_padding` work --- R/get_dist.R | 19 ++++++++++--------- man/dfm.tokens_with_proximity.Rd | 8 ++++---- man/tokens_proximity.Rd | 7 ++----- tests/testthat/test-dfm.R | 8 ++++++++ 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/R/get_dist.R b/R/get_dist.R index e2f7132..6757545 100644 --- a/R/get_dist.R +++ b/R/get_dist.R @@ -62,11 +62,8 @@ pp <- function(pattern) { #' @details Proximity is measured by the number of tokens away from the keyword. Given a tokenized sentence: \["I", "eat", "this", "apple"\] and suppose "eat" is the keyword. The vector of minimum proximity for each word from "eat" is \[2, 1, 2, 3\], if `count_from` is 1. In another case: \["I", "wash", "and", "eat", "this", "apple"\] and \["wash", "eat"\] are the keywords. The minimal distance vector is \[2, 1, 2, 1, 2, 3\]. If `get_min` is `FALSE`, the output is a list of two vectors. For "wash", the distance vector is \[1, 0, 1, 2, 3\]. For "eat", \[3, 2, 1, 0, 1, 2\]. #' Please conduct all text maniputation tasks with `tokens_*()` functions before calling this function. To convert the output back to a `tokens` object, use [quanteda::as.tokens()]. #' @return a `tokens_with_proximity` object. It is similar to [quanteda::tokens()], but only [dfm.tokens_with_proximity()], [quanteda::convert()], [quanteda::docvars()], and [quanteda::meta()] methods are available. A `tokens_with_proximity` has a modified [print()] method. 
Also, additional data slots are included -#' * a document variation `dist` -#' * a metadata slot `keywords` -#' * a metadata slot `get_min` -#' * a metadata slot `tolower` -#' * a metadata slot `keep_acronyms` +#' * a document variable `proximity` +#' * metadata slots for all arguments used #' @examples #' library(quanteda) #' tok1 <- data_char_ukimmig2010 %>% @@ -193,14 +190,14 @@ tokens_proximity_tolower <- function(x) { #' Construct a sparse document-feature matrix from the output of [tokens_proximity()]. #' @param x output of [tokens_proximity()]. #' @param tolower convert all features to lowercase. -#' @param remove_padding ignored. -#' @param remove_docvars_proximity boolean, remove the "proximity" document variable. -#' @param verbose ignored, +#' @param remove_padding logical; if `TRUE`, remove the "pads" left as empty tokens after calling [quanteda::tokens()] or [quanteda::tokens_remove()] with `padding = TRUE`. +#' @param remove_docvars_proximity logical, remove the "proximity" document variable. +#' @param verbose display messages if `TRUE`. #' @param weight_function a weight function, default to invert distance, #' @param ... not used. #' @importFrom quanteda dfm #' @return a [quanteda::dfm-class] object -#' @details By default, words closer to keywords are weighted higher. You might change that with another `weight_function`. Please also note that `tolower` and `remove_padding` have no effect. It is because changing tokens at this point would need to recalculate the proximity vectors. Please do all the text manipulation before running [tokens_proximity()]. +#' @details By default, words closer to keywords are weighted higher. You might change that with another `weight_function`. 
#' @examples #' library(quanteda) #' tok1 <- data_char_ukimmig2010 %>% @@ -262,5 +259,9 @@ dfm.tokens_with_proximity <- function(x, tolower = TRUE, remove_padding = FALSE, x_docvars$proximity <- NULL } quanteda::docvars(output) <- x_docvars + if (remove_padding) { + output <- quanteda::dfm_select(output, pattern = "", selection = "remove", valuetype = "fixed", padding = FALSE, + verbose = verbose) + } return(output) } diff --git a/man/dfm.tokens_with_proximity.Rd b/man/dfm.tokens_with_proximity.Rd index ffa5bcc..56133d8 100644 --- a/man/dfm.tokens_with_proximity.Rd +++ b/man/dfm.tokens_with_proximity.Rd @@ -21,11 +21,11 @@ \item{tolower}{convert all features to lowercase.} -\item{remove_padding}{ignored.} +\item{remove_padding}{logical; if \code{TRUE}, remove the "pads" left as empty tokens after calling \code{\link[quanteda:tokens]{quanteda::tokens()}} or \code{\link[quanteda:tokens_select]{quanteda::tokens_remove()}} with \code{padding = TRUE}.} -\item{verbose}{ignored,} +\item{verbose}{display messages if \code{TRUE}.} -\item{remove_docvars_proximity}{boolean, remove the "proximity" document variable.} +\item{remove_docvars_proximity}{logical, remove the "proximity" document variable.} \item{weight_function}{a weight function, default to invert distance,} @@ -38,7 +38,7 @@ a \link[quanteda:dfm-class]{quanteda::dfm} object Construct a sparse document-feature matrix from the output of \code{\link[=tokens_proximity]{tokens_proximity()}}. } \details{ -By default, words closer to keywords are weighted higher. You might change that with another \code{weight_function}. Please also note that \code{tolower} and \code{remove_padding} have no effect. It is because changing tokens at this point would need to recalculate the proximity vectors. Please do all the text manipulation before running \code{\link[=tokens_proximity]{tokens_proximity()}}. +By default, words closer to keywords are weighted higher. You might change that with another \code{weight_function}. 
} \examples{ library(quanteda) diff --git a/man/tokens_proximity.Rd b/man/tokens_proximity.Rd index 2d08d19..50e0a0d 100644 --- a/man/tokens_proximity.Rd +++ b/man/tokens_proximity.Rd @@ -35,11 +35,8 @@ tokens_proximity( \value{ a \code{tokens_with_proximity} object. It is similar to \code{\link[quanteda:tokens]{quanteda::tokens()}}, but only \code{\link[=dfm.tokens_with_proximity]{dfm.tokens_with_proximity()}}, \code{\link[quanteda:convert]{quanteda::convert()}}, \code{\link[quanteda:docvars]{quanteda::docvars()}}, and \code{\link[quanteda:meta]{quanteda::meta()}} methods are available. A \code{tokens_with_proximity} has a modified \code{\link[=print]{print()}} method. Also, additional data slots are included \itemize{ -\item a document variation \code{dist} -\item a metadata slot \code{keywords} -\item a metadata slot \code{get_min} -\item a metadata slot \code{tolower} -\item a metadata slot \code{keep_acronyms} +\item a document variable \code{proximity} +\item metadata slots for all arguments used } } \description{ diff --git a/tests/testthat/test-dfm.R b/tests/testthat/test-dfm.R index 8ea5b74..b2ed545 100644 --- a/tests/testthat/test-dfm.R +++ b/tests/testthat/test-dfm.R @@ -36,3 +36,11 @@ test_that("Padding #46", { expect_error(toks %>% tokens_proximity("a") %>% dfm(), NA) }) +test_that("remove_padding", { + suppressPackageStartupMessages(library(quanteda)) + toks <- tokens(c("a b c", "A B C D")) %>% tokens_remove("b", padding = TRUE) + output <- toks %>% tokens_proximity("a") %>% dfm() + expect_true("" %in% colnames(output)) + output <- toks %>% tokens_proximity("a") %>% dfm(remove_padding = TRUE) + expect_false("" %in% colnames(output)) +})