Skip to content

Commit

Permalink
add print method ref #2
Browse files Browse the repository at this point in the history
  • Loading branch information
chainsawriot committed Nov 26, 2023
1 parent 90e45b1 commit 37816fe
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 22 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# Generated by roxygen2: do not edit by hand

S3method(as.tokens,tokens_with_tokenvars)
S3method(docvars,tokens_with_tokenvars)
S3method(print,tokens_with_tokenvars)
export("tokenvars<-")
export(tokens_add_tokenvars)
export(tokenvars)
importFrom(quanteda,as.tokens)
importFrom(quanteda,docvars)
76 changes: 63 additions & 13 deletions R/tokenvars.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,73 @@ as.tokens.tokens_with_tokenvars <- function(x, remove_tokenvars = TRUE, ...) {
return(x)
}

#' @export
#' @method docvars tokens_with_tokenvars
#' @importFrom quanteda docvars
docvars.tokens_with_tokenvars <- function(x, field = NULL) {
return(docvars(as.tokens(x, remove_tokenvars = FALSE), field = field))
}

print_item <- function(x, flatten, tokenids) {
for (i in seq_along(x)) {
cat("[", tokenids[i], "] ", x[i], " (", flatten[i], ") ", sep = "")
}
}

flat_tokenvars <- function(df) {
vapply(seq_len(nrow(df)), function(y) paste(as.character(df[y,]), collapse = "|"), "")
}

#' @export
print.tokens_with_tokenvars <- function(x, max_ndoc = quanteda::quanteda_options("print_tokens_max_ndoc"),
max_ntoken = quanteda::quanteda_options("print_tokens_max_ntoken"),
show_summary = quanteda::quanteda_options("print_tokens_summary"), ...) {
## TODO
print(as.tokens(x, remove_tokenvars = FALSE), max_ndoc = max_ndoc, max_ntoken = max_ntoken, show_summary = show_summary)
cat("With Token Variables.\n")
## modified from quanteda::print.tokens
##print(as.tokens(x, remove_tokenvars = FALSE), max_ndoc = max_ndoc, max_ntoken = max_ntoken, show_summary = show_summary)
ndoc <- length(x)
docvars <- docvars(x)
xtokenvars <- tokenvars(x)
if (max_ndoc < 0) {
max_ndoc <- ndoc
}
if (show_summary) {
cat("Tokens consisting of ", format(ndoc, big.mark = ","), " document",
if (ndoc != 1L) "s" else "", sep = "")
if (ncol(docvars)) {
cat(" and ", format(ncol(docvars), big.mark = ","), " docvar",
if (ncol(docvars) != 1L) "s" else "", sep = "")
}
cat(".\n")
if (ncol(xtokenvars[[1]])) {
cat("Token variables: (", paste(names(xtokenvars[[1]]), collapse = "|"), ").\n", sep = "")
}
}
if (max_ndoc > 0 && ndoc > 0) {
subsetted_x <- head(x, max_ndoc)
xtokenvars <- head(xtokenvars, max_ndoc)
docids <- paste0(names(subsetted_x), " :")
types <- c("", attr(x, "types"))
len <- lengths(subsetted_x)
if (max_ntoken < 0) {
max_ntoken <- max(len)
}
tokens_to_display <- lapply(unclass(subsetted_x), function(y) types[head(y, max_ntoken) + 1])
flatten_tokenvars <- lapply(xtokenvars, flat_tokenvars)
tokenids <- lapply(subsetted_x, names)
for (i in seq_along(docids)) {
cat(docids[i], "\n", sep = "")
print_item(tokens_to_display[[i]], flatten_tokenvars[[i]], tokenids[[i]])
if (len[i] > max_ntoken) {
cat("{ ... and ", format(len[i] - max_ntoken, big.mark = ","), " more }\n", sep = "")
}
cat("\n", sep = "")
}
ndoc_rem <- ndoc - max_ndoc
if (ndoc_rem > 0) {
cat("{ reached max_ndoc ... ", format(ndoc_rem, big.mark = ","), " more document",
if (ndoc_rem > 1) "s", " }\n", sep = "")
}
}
}

make_tokenvars <- function(unclassed_x) {
Expand All @@ -86,13 +146,3 @@ add_tokenid <- function(unclassed_x) {
}
return(unclassed_x)
}

pp <- function(x, max_ndoc = quanteda::quanteda_options("print_tokens_max_ndoc"),
max_ntoken = quanteda::quanteda_options("print_tokens_max_ntoken"),
show_summary = quanteda::quanteda_options("print_tokens_summary"), ...) {
## Pretty print; probably I can't hijack quanteda::print.tokens
if (is.null(attr(x, "tokenvars"))) {
print(x, max_ndoc = max_ndoc, max_ntoken = max_ntoken, show_summary = show_summary, ...)
return(invisible(NULL))
}
}
8 changes: 6 additions & 2 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,17 @@ tokenvars(tok, "lemma") <- list(c("spaCy", "be", "great", "at", "fast", "natural
```

```{r example4}
tokenvars(tok)
tok
```

```{r example5}
tokenvars(tok, field = "tag")
tokenvars(tok)
```

```{r example6}
tokenvars(tok, field = "tag")
```

```{r example7}
tokenvars(tok, field = "lemma", docid = "d2")
```
19 changes: 12 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,9 @@ tok <- tokens(corp) %>% tokens_add_tokenvars()
tok
#> Tokens consisting of 2 documents.
#> d1 :
#> [1] "spaCy" "is" "great" "at" "fast"
#> [6] "natural" "language" "processing" "."
#>
#> [t1] spaCy () [t2] is () [t3] great () [t4] at () [t5] fast () [t6] natural () [t7] language () [t8] processing () [t9] . ()
#> d2 :
#> [1] "Mr" "." "Smith" "spent" "two" "years"
#> [7] "in" "North" "Carolina" "."
#>
#> With Token Variables.
#> [t1] Mr () [t2] . () [t3] Smith () [t4] spent () [t5] two () [t6] years () [t7] in () [t8] North () [t9] Carolina () [t10] . ()
```

``` r
Expand All @@ -63,6 +58,16 @@ tokenvars(tok, "lemma") <- list(c("spaCy", "be", "great", "at", "fast", "natural
c("Mr", ".", "Smith", "spend", "two", "year", "in", "North", "Carolina", "."))
```

``` r
tok
#> Tokens consisting of 2 documents.
#> Token variables: (tag|lemma).
#> d1 :
#> [t1] spaCy (NNP|spaCy) [t2] is (VBZ|be) [t3] great (JJ|great) [t4] at (IN|at) [t5] fast (JJ|fast) [t6] natural (JJ|natural) [t7] language (NN|language) [t8] processing (NN|processing) [t9] . (.|.)
#> d2 :
#> [t1] Mr (NNP|Mr) [t2] . (.|.) [t3] Smith (NNP|Smith) [t4] spent (VBD|spend) [t5] two (CD|two) [t6] years (NNS|year) [t7] in (IN|in) [t8] North (NNP|North) [t9] Carolina (NNP|Carolina) [t10] . (.|.)
```

``` r
tokenvars(tok)
#> $d1
Expand Down

0 comments on commit 37816fe

Please sign in to comment.