From 0fe9d1c08a555adb51eaff0948e5e229fcac04d2 Mon Sep 17 00:00:00 2001
From: chainsawriot <chainsawtiney@gmail.com>
Date: Wed, 22 Nov 2023 11:22:21 +0100
Subject: [PATCH] Make `remove_padding` work

---
 R/get_dist.R                     | 19 ++++++++++---------
 man/dfm.tokens_with_proximity.Rd |  8 ++++----
 man/tokens_proximity.Rd          |  7 ++-----
 tests/testthat/test-dfm.R        |  8 ++++++++
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/R/get_dist.R b/R/get_dist.R
index e2f7132..6757545 100644
--- a/R/get_dist.R
+++ b/R/get_dist.R
@@ -62,11 +62,8 @@ pp <- function(pattern) {
 #' @details Proximity is measured by the number of tokens away from the keyword. Given a tokenized sentence: \["I", "eat", "this", "apple"\] and suppose "eat" is the keyword. The vector of minimum proximity for each word from "eat" is \[2, 1, 2, 3\], if `count_from` is 1. In another case: \["I", "wash", "and", "eat", "this", "apple"\] and \["wash", "eat"\] are the keywords. The minimal distance vector is \[2, 1, 2, 1, 2, 3\]. If `get_min` is `FALSE`, the output is a list of two vectors. For "wash", the distance vector is \[1, 0, 1, 2, 3\]. For "eat", \[3, 2, 1, 0, 1, 2\].
 #' Please conduct all text maniputation tasks with `tokens_*()` functions before calling this function. To convert the output back to a `tokens` object, use [quanteda::as.tokens()].
 #' @return a `tokens_with_proximity` object. It is similar to [quanteda::tokens()], but only [dfm.tokens_with_proximity()], [quanteda::convert()], [quanteda::docvars()], and [quanteda::meta()] methods are available. A `tokens_with_proximity` has a modified [print()] method. Also, additional data slots are included
-#' * a document variation `dist`
-#' * a metadata slot `keywords`
-#' * a metadata slot `get_min`
-#' * a metadata slot `tolower`
-#' * a metadata slot `keep_acronyms`
+#' * a document variable `proximity`
+#' * metadata slots for all arguments used
 #' @examples
 #' library(quanteda)
 #' tok1 <- data_char_ukimmig2010 %>%
@@ -193,14 +190,14 @@ tokens_proximity_tolower <- function(x) {
 #' Construct a sparse document-feature matrix from the output of [tokens_proximity()].
 #' @param x output of [tokens_proximity()].
 #' @param tolower convert all features to lowercase.
-#' @param remove_padding ignored.
-#' @param remove_docvars_proximity boolean, remove the "proximity" document variable.
-#' @param verbose ignored,
+#' @param remove_padding logical; if `TRUE`, remove the "pads" left as empty tokens after calling [quanteda::tokens()] or [quanteda::tokens_remove()] with `padding = TRUE`.
+#' @param remove_docvars_proximity logical, remove the "proximity" document variable.
+#' @param verbose display messages if `TRUE`.
 #' @param weight_function a weight function, default to invert distance,
 #' @param ... not used.
 #' @importFrom quanteda dfm
 #' @return a [quanteda::dfm-class] object
-#' @details By default, words closer to keywords are weighted higher. You might change that with another `weight_function`. Please also note that `tolower` and `remove_padding` have no effect. It is because changing tokens at this point would need to recalculate the proximity vectors. Please do all the text manipulation before running [tokens_proximity()].
+#' @details By default, words closer to keywords are weighted higher. You might change that with another `weight_function`.
 #' @examples
 #' library(quanteda)
 #' tok1 <- data_char_ukimmig2010 %>%
@@ -262,5 +259,9 @@ dfm.tokens_with_proximity <- function(x, tolower = TRUE, remove_padding = FALSE,
         x_docvars$proximity <- NULL
     }
     quanteda::docvars(output) <- x_docvars
+    if (remove_padding) {
+        output <- quanteda::dfm_select(output, pattern = "", select = "remove", valuetype = "fixed", padding = FALSE,
+                                       verbose = verbose)
+    }
     return(output)
 }
diff --git a/man/dfm.tokens_with_proximity.Rd b/man/dfm.tokens_with_proximity.Rd
index ffa5bcc..56133d8 100644
--- a/man/dfm.tokens_with_proximity.Rd
+++ b/man/dfm.tokens_with_proximity.Rd
@@ -21,11 +21,11 @@
 
 \item{tolower}{convert all features to lowercase.}
 
-\item{remove_padding}{ignored.}
+\item{remove_padding}{logical; if \code{TRUE}, remove the "pads" left as empty tokens after calling \code{\link[quanteda:tokens]{quanteda::tokens()}} or \code{\link[quanteda:tokens_select]{quanteda::tokens_remove()}} with \code{padding = TRUE}.}
 
-\item{verbose}{ignored,}
+\item{verbose}{display messages if \code{TRUE}.}
 
-\item{remove_docvars_proximity}{boolean, remove the "proximity" document variable.}
+\item{remove_docvars_proximity}{logical, remove the "proximity" document variable.}
 
 \item{weight_function}{a weight function, default to invert distance,}
 
@@ -38,7 +38,7 @@ a \link[quanteda:dfm-class]{quanteda::dfm} object
 Construct a sparse document-feature matrix from the output of \code{\link[=tokens_proximity]{tokens_proximity()}}.
 }
 \details{
-By default, words closer to keywords are weighted higher. You might change that with another \code{weight_function}. Please also note that \code{tolower} and \code{remove_padding} have no effect. It is because changing tokens at this point would need to recalculate the proximity vectors. Please do all the text manipulation before running \code{\link[=tokens_proximity]{tokens_proximity()}}.
+By default, words closer to keywords are weighted higher. You might change that with another \code{weight_function}.
 }
 \examples{
 library(quanteda)
diff --git a/man/tokens_proximity.Rd b/man/tokens_proximity.Rd
index 2d08d19..50e0a0d 100644
--- a/man/tokens_proximity.Rd
+++ b/man/tokens_proximity.Rd
@@ -35,11 +35,8 @@ tokens_proximity(
 \value{
 a \code{tokens_with_proximity} object. It is similar to \code{\link[quanteda:tokens]{quanteda::tokens()}}, but only \code{\link[=dfm.tokens_with_proximity]{dfm.tokens_with_proximity()}}, \code{\link[quanteda:convert]{quanteda::convert()}}, \code{\link[quanteda:docvars]{quanteda::docvars()}}, and \code{\link[quanteda:meta]{quanteda::meta()}} methods are available. A \code{tokens_with_proximity} has a modified \code{\link[=print]{print()}} method. Also, additional data slots are included
 \itemize{
-\item a document variation \code{dist}
-\item a metadata slot \code{keywords}
-\item a metadata slot \code{get_min}
-\item a metadata slot \code{tolower}
-\item a metadata slot \code{keep_acronyms}
+\item a document variable \code{proximity}
+\item metadata slots for all arguments used
 }
 }
 \description{
diff --git a/tests/testthat/test-dfm.R b/tests/testthat/test-dfm.R
index 8ea5b74..b2ed545 100644
--- a/tests/testthat/test-dfm.R
+++ b/tests/testthat/test-dfm.R
@@ -36,3 +36,11 @@ test_that("Padding #46", {
     expect_error(toks %>% tokens_proximity("a") %>% dfm(), NA)
 })
 
+test_that("remove_padding", {
+    suppressPackageStartupMessages(library(quanteda))
+    toks <- tokens(c("a b c", "A B C D")) %>% tokens_remove("b", padding = TRUE)
+    output <- toks %>% tokens_proximity("a") %>% dfm()
+    expect_true("" %in% colnames(output))
+    output <- toks %>% tokens_proximity("a") %>% dfm(remove_padding = TRUE)
+    expect_false("" %in% colnames(output))
+})