diff --git a/NAMESPACE b/NAMESPACE index 483d6ec..435de21 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,10 @@ # Generated by roxygen2: do not edit by hand S3method(as.tokens,tokens_with_tokenvars) +S3method(docvars,tokens_with_tokenvars) S3method(print,tokens_with_tokenvars) export("tokenvars<-") export(tokens_add_tokenvars) export(tokenvars) importFrom(quanteda,as.tokens) +importFrom(quanteda,docvars) diff --git a/R/tokenvars.R b/R/tokenvars.R index 97fe9d8..7c08ee9 100644 --- a/R/tokenvars.R +++ b/R/tokenvars.R @@ -59,13 +59,73 @@ as.tokens.tokens_with_tokenvars <- function(x, remove_tokenvars = TRUE, ...) { return(x) } +#' @export +#' @method docvars tokens_with_tokenvars +#' @importFrom quanteda docvars +docvars.tokens_with_tokenvars <- function(x, field = NULL) { + return(docvars(as.tokens(x, remove_tokenvars = FALSE), field = field)) +} + +print_item <- function(x, flatten, tokenids) { + for (i in seq_along(x)) { + cat("[", tokenids[i], "] ", x[i], " (", flatten[i], ") ", sep = "") + } +} + +flat_tokenvars <- function(df) { + vapply(seq_len(nrow(df)), function(y) paste(as.character(df[y,]), collapse = "|"), "") +} + #' @export print.tokens_with_tokenvars <- function(x, max_ndoc = quanteda::quanteda_options("print_tokens_max_ndoc"), max_ntoken = quanteda::quanteda_options("print_tokens_max_ntoken"), show_summary = quanteda::quanteda_options("print_tokens_summary"), ...) { - ## TODO - print(as.tokens(x, remove_tokenvars = FALSE), max_ndoc = max_ndoc, max_ntoken = max_ntoken, show_summary = show_summary) - cat("With Token Variables.\n") + ## modified from quanteda::print.tokens + ##print(as.tokens(x, remove_tokenvars = FALSE), max_ndoc = max_ndoc, max_ntoken = max_ntoken, show_summary = show_summary) + ndoc <- length(x) + docvars <- docvars(x) + xtokenvars <- tokenvars(x) + if (max_ndoc < 0) { + max_ndoc <- ndoc + } + if (show_summary) { + cat("Tokens consisting of ", format(ndoc, big.mark = ","), " document", + if (ndoc != 1L) "s" else "", sep = "") + if (ncol(docvars)) { + cat(" and ", format(ncol(docvars), big.mark = ","), " docvar", + if (ncol(docvars) != 1L) "s" else "", sep = "") + } + cat(".\n") + if (ncol(xtokenvars[[1]])) { + cat("Token variables: (", paste(names(xtokenvars[[1]]), collapse = "|"), ").\n", sep = "") + } + } + if (max_ndoc > 0 && ndoc > 0) { + subsetted_x <- head(x, max_ndoc) + xtokenvars <- head(xtokenvars, max_ndoc) + docids <- paste0(names(subsetted_x), " :") + types <- c("", attr(x, "types")) + len <- lengths(subsetted_x) + if (max_ntoken < 0) { + max_ntoken <- max(len) + } + tokens_to_display <- lapply(unclass(subsetted_x), function(y) types[head(y, max_ntoken) + 1]) + flatten_tokenvars <- lapply(xtokenvars, flat_tokenvars) + tokenids <- lapply(subsetted_x, names) + for (i in seq_along(docids)) { + cat(docids[i], "\n", sep = "") + print_item(tokens_to_display[[i]], flatten_tokenvars[[i]], tokenids[[i]]) + if (len[i] > max_ntoken) { + cat("{ ... and ", format(len[i] - max_ntoken, big.mark = ","), " more }\n", sep = "") + } + cat("\n", sep = "") + } + ndoc_rem <- ndoc - max_ndoc + if (ndoc_rem > 0) { + cat("{ reached max_ndoc ... ", format(ndoc_rem, big.mark = ","), " more document", + if (ndoc_rem > 1) "s", " }\n", sep = "") + } + } } make_tokenvars <- function(unclassed_x) { @@ -86,13 +146,3 @@ add_tokenid <- function(unclassed_x) { } return(unclassed_x) } - -pp <- function(x, max_ndoc = quanteda::quanteda_options("print_tokens_max_ndoc"), - max_ntoken = quanteda::quanteda_options("print_tokens_max_ntoken"), - show_summary = quanteda::quanteda_options("print_tokens_summary"), ...) { - ## Pretty print; probably I can't hijack quanteda::print.tokens - if (is.null(attr(x, "tokenvars"))) { - print(x, max_ndoc = max_ndoc, max_ntoken = max_ntoken, show_summary = show_summary, ...) - return(invisible(NULL)) - } -} diff --git a/README.Rmd b/README.Rmd index f5dc470..a67cb7a 100644 --- a/README.Rmd +++ b/README.Rmd @@ -55,13 +55,17 @@ tokenvars(tok, "lemma") <- list(c("spaCy", "be", "great", "at", "fast", "natural ``` ```{r example4} -tokenvars(tok) +tok ``` ```{r example5} -tokenvars(tok, field = "tag") +tokenvars(tok) ``` ```{r example6} +tokenvars(tok, field = "tag") +``` + +```{r example7} tokenvars(tok, field = "lemma", docid = "d2") ``` diff --git a/README.md b/README.md index ae46733..c5df260 100644 --- a/README.md +++ b/README.md @@ -37,14 +37,9 @@ tok <- tokens(corp) %>% tokens_add_tokenvars() tok #> Tokens consisting of 2 documents. #> d1 : -#> [1] "spaCy" "is" "great" "at" "fast" -#> [6] "natural" "language" "processing" "." -#> +#> [t1] spaCy () [t2] is () [t3] great () [t4] at () [t5] fast () [t6] natural () [t7] language () [t8] processing () [t9] . () #> d2 : -#> [1] "Mr" "." "Smith" "spent" "two" "years" -#> [7] "in" "North" "Carolina" "." -#> -#> With Token Variables. +#> [t1] Mr () [t2] . () [t3] Smith () [t4] spent () [t5] two () [t6] years () [t7] in () [t8] North () [t9] Carolina () [t10] . () ``` ``` r @@ -63,6 +58,16 @@ tokenvars(tok, "lemma") <- list(c("spaCy", "be", "great", "at", "fast", "natural c("Mr", ".", "Smith", "spend", "two", "year", "in", "North", "Carolina", ".")) ``` +``` r +tok +#> Tokens consisting of 2 documents. +#> Token variables: (tag|lemma). +#> d1 : +#> [t1] spaCy (NNP|spaCy) [t2] is (VBZ|be) [t3] great (JJ|great) [t4] at (IN|at) [t5] fast (JJ|fast) [t6] natural (JJ|natural) [t7] language (NN|language) [t8] processing (NN|processing) [t9] . (.|.) +#> d2 : +#> [t1] Mr (NNP|Mr) [t2] . (.|.) [t3] Smith (NNP|Smith) [t4] spent (VBD|spend) [t5] two (CD|two) [t6] years (NNS|year) [t7] in (IN|in) [t8] North (NNP|North) [t9] Carolina (NNP|Carolina) [t10] . (.|.) +``` + ``` r tokenvars(tok) #> $d1