add print method ref #2

gesistsa · Nov 26, 2023 · 37816fe · 37816fe
1 parent 90e45b1
commit 37816fe
Show file tree

Hide file tree

Showing 4 changed files with 83 additions and 22 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,8 +1,10 @@
 # Generated by roxygen2: do not edit by hand
 
 S3method(as.tokens,tokens_with_tokenvars)
+S3method(docvars,tokens_with_tokenvars)
 S3method(print,tokens_with_tokenvars)
 export("tokenvars<-")
 export(tokens_add_tokenvars)
 export(tokenvars)
 importFrom(quanteda,as.tokens)
+importFrom(quanteda,docvars)
diff --git a/R/tokenvars.R b/R/tokenvars.R
@@ -59,13 +59,73 @@ as.tokens.tokens_with_tokenvars <- function(x, remove_tokenvars = TRUE, ...) {
     return(x)
 }
 
+#' @export
+#' @method docvars tokens_with_tokenvars
+#' @importFrom quanteda docvars
+docvars.tokens_with_tokenvars <- function(x, field = NULL) {
+    return(docvars(as.tokens(x, remove_tokenvars = FALSE), field = field))
+}
+
+print_item <- function(x, flatten, tokenids) {
+    for (i in seq_along(x)) {
+        cat("[", tokenids[i], "] ", x[i], " (", flatten[i], ") ", sep = "")
+    }
+}
+
+flat_tokenvars <- function(df) {
+    vapply(seq_len(nrow(df)), function(y) paste(as.character(df[y,]), collapse = "|"), "")
+}
+
 #' @export
 print.tokens_with_tokenvars <- function(x, max_ndoc = quanteda::quanteda_options("print_tokens_max_ndoc"),
                max_ntoken = quanteda::quanteda_options("print_tokens_max_ntoken"),
                show_summary = quanteda::quanteda_options("print_tokens_summary"), ...) {
-    ## TODO
-    print(as.tokens(x, remove_tokenvars = FALSE), max_ndoc = max_ndoc, max_ntoken = max_ntoken, show_summary = show_summary)
-    cat("With Token Variables.\n")
+    ## modified from quanteda::print.tokens
+    ##print(as.tokens(x, remove_tokenvars = FALSE), max_ndoc = max_ndoc, max_ntoken = max_ntoken, show_summary = show_summary)
+    ndoc <- length(x)
+    docvars <- docvars(x)
+    xtokenvars <- tokenvars(x)
+    if (max_ndoc < 0) {
+        max_ndoc <- ndoc
+    }
+    if (show_summary) {
+        cat("Tokens consisting of ", format(ndoc, big.mark = ","), " document",
+            if (ndoc != 1L) "s" else "", sep = "")
+        if (ncol(docvars)) {
+            cat(" and ", format(ncol(docvars), big.mark = ","), " docvar",
+                if (ncol(docvars) != 1L) "s" else "", sep = "")
+        }
+        cat(".\n")
+        if (ncol(xtokenvars[[1]])) {
+            cat("Token variables: (", paste(names(xtokenvars[[1]]), collapse = "|"), ").\n", sep = "")
+        }
+    }
+    if (max_ndoc > 0 && ndoc > 0) {
+        subsetted_x <- head(x, max_ndoc)
+        xtokenvars <- head(xtokenvars, max_ndoc)
+        docids <- paste0(names(subsetted_x), " :")
+        types <- c("", attr(x, "types"))
+        len <- lengths(subsetted_x)
+        if (max_ntoken < 0) {
+            max_ntoken <- max(len)
+        }
+        tokens_to_display <- lapply(unclass(subsetted_x), function(y) types[head(y, max_ntoken) + 1])
+        flatten_tokenvars <- lapply(xtokenvars, flat_tokenvars)
+        tokenids <- lapply(subsetted_x, names)
+        for (i in seq_along(docids)) {
+            cat(docids[i], "\n", sep = "")
+            print_item(tokens_to_display[[i]], flatten_tokenvars[[i]], tokenids[[i]])
+            if (len[i] > max_ntoken) {
+                cat("{ ... and ",  format(len[i] - max_ntoken, big.mark = ","), " more }\n", sep = "")
+            }
+            cat("\n", sep = "")
+        }
+        ndoc_rem <- ndoc - max_ndoc
+        if (ndoc_rem > 0) {
+            cat("{ reached max_ndoc ... ", format(ndoc_rem, big.mark = ","), " more document",
+                if (ndoc_rem > 1) "s", " }\n", sep = "")
+        }
+    }
 }
 
 make_tokenvars <- function(unclassed_x) {
@@ -86,13 +146,3 @@ add_tokenid <- function(unclassed_x) {
     }
     return(unclassed_x)
 }
-
-pp <- function(x, max_ndoc = quanteda::quanteda_options("print_tokens_max_ndoc"),
-               max_ntoken = quanteda::quanteda_options("print_tokens_max_ntoken"),
-               show_summary = quanteda::quanteda_options("print_tokens_summary"), ...) {
-    ## Pretty print; probably I can't hijack quanteda::print.tokens
-    if (is.null(attr(x, "tokenvars"))) {
-        print(x, max_ndoc = max_ndoc, max_ntoken = max_ntoken, show_summary = show_summary, ...)
-        return(invisible(NULL))
-    }    
-}
diff --git a/README.Rmd b/README.Rmd
@@ -55,13 +55,17 @@ tokenvars(tok, "lemma") <- list(c("spaCy", "be", "great", "at", "fast", "natural
 ```
 
 ```{r example4}
-tokenvars(tok)
+tok
 ```
 
 ```{r example5}
-tokenvars(tok, field = "tag")
+tokenvars(tok)
 ```
 
 ```{r example6}
+tokenvars(tok, field = "tag")
+```
+
+```{r example7}
 tokenvars(tok, field = "lemma", docid = "d2")
 ```
diff --git a/README.md b/README.md
@@ -37,14 +37,9 @@ tok <- tokens(corp) %>% tokens_add_tokenvars()
 tok
 #> Tokens consisting of 2 documents.
 #> d1 :
-#> [1] "spaCy"      "is"         "great"      "at"         "fast"      
-#> [6] "natural"    "language"   "processing" "."         
-#> 
+#> [t1] spaCy () [t2] is () [t3] great () [t4] at () [t5] fast () [t6] natural () [t7] language () [t8] processing () [t9] . () 
 #> d2 :
-#>  [1] "Mr"       "."        "Smith"    "spent"    "two"      "years"   
-#>  [7] "in"       "North"    "Carolina" "."       
-#> 
-#> With Token Variables.
+#> [t1] Mr () [t2] . () [t3] Smith () [t4] spent () [t5] two () [t6] years () [t7] in () [t8] North () [t9] Carolina () [t10] . ()
 ```
 
 ``` r
@@ -63,6 +58,16 @@ tokenvars(tok, "lemma") <- list(c("spaCy", "be", "great", "at", "fast", "natural
                                 c("Mr", ".", "Smith", "spend", "two", "year", "in", "North", "Carolina", "."))
 ```
 
+``` r
+tok
+#> Tokens consisting of 2 documents.
+#> Token variables: (tag|lemma).
+#> d1 :
+#> [t1] spaCy (NNP|spaCy) [t2] is (VBZ|be) [t3] great (JJ|great) [t4] at (IN|at) [t5] fast (JJ|fast) [t6] natural (JJ|natural) [t7] language (NN|language) [t8] processing (NN|processing) [t9] . (.|.) 
+#> d2 :
+#> [t1] Mr (NNP|Mr) [t2] . (.|.) [t3] Smith (NNP|Smith) [t4] spent (VBD|spend) [t5] two (CD|two) [t6] years (NNS|year) [t7] in (IN|in) [t8] North (NNP|North) [t9] Carolina (NNP|Carolina) [t10] . (.|.)
+```
+
 ``` r
 tokenvars(tok)
 #> $d1