This repository has been archived by the owner on Feb 11, 2024. It is now read-only.

Update doc with the Hungarian notation ref #27
chainsawriot committed Nov 17, 2023
1 parent 45b0dbd commit a4587d2
Showing 5 changed files with 93 additions and 62 deletions.
22 changes: 11 additions & 11 deletions R/get_dist.R
@@ -38,27 +38,27 @@ get_proximity <- function(x, keywords, get_min = TRUE, count_from = 1) {
grep(regex, features, value = TRUE)
}

-#' Extract Distance Information
+#' Extract Proximity Information
#'
#' This function extracts distance information from a [quanteda::tokens()] object.
#' @param x a `tokens` object
#' @param keywords a character vector of anchor words
#' @param pattern Pattern for selecting keywords, see [quanteda::pattern] for details.
#' @param get_min logical, whether to return only the minimum distance or raw distance information; it is more relevant when `keywords` have more than one word. See details.
#' @param valuetype See [quanteda::valuetype]
-#' @param count_from numeric, how proximity is counted from when `get_min` is `TRUE`. The keyword is assigned with this proximity. Default to 1 (not zero) to prevent division by 0 with the default behavior of [dfm.tokens_with_proximity()].
+#' @param count_from numeric, how proximity is counted from when `get_min` is `TRUE`. The keyword is assigned with this proximity. Defaults to 1 (not zero) to prevent division by 0 with the default behaviour of [dfm.tokens_with_proximity()].
#' @details Proximity is measured by the number of tokens away from the keyword. Given a tokenized sentence: \["I", "eat", "this", "apple"\] and suppose "eat" is the target. The vector of minimum proximity for each word from "eat" is \[2, 1, 2, 3\], if `count_from` is 1. In another case: \["I", "wash", "and", "eat", "this", "apple"\] and \["wash", "eat"\] are the keywords. The minimal distance vector is \[2, 1, 2, 1, 2, 3\]. If `get_min` is `FALSE`, the output is a list of two vectors. For "wash", the distance vector is \[1, 0, 1, 2, 3\]. For "eat", \[3, 2, 1, 0, 1, 2\].
-#' It is recommended to conduct all text maniputation tasks with all `tokens_*()` functions before calling this function.
+#' It is recommended to conduct all text manipulation tasks with `tokens_*()` functions before calling this function.
#' @return a `tokens_with_proximity` object. It is a derivative of [quanteda::tokens()], i.e. all `token_*` functions still work. A `tokens_with_proximity` has a modified [print()] method. Also, additional data slots are included
#' * a document variation `dist`
#' * a metadata slot `keywords`
#' * a metadata slot `get_min`
#' @examples
#' library(quanteda)
-#' ukimg_eu <- data_char_ukimmig2010 %>%
+#' tok1 <- data_char_ukimmig2010 %>%
#' tokens(remove_punct = TRUE) %>%
#' tokens_tolower() %>%
#' tokens_proximity(c("eu", "euro*"))
-#' ukimg_eu %>%
+#' tok1 %>%
#' dfm() %>%
#' dfm_select(c("immig*", "migr*")) %>%
#' rowSums() %>%
@@ -72,7 +72,7 @@ get_proximity <- function(x, keywords, get_min = TRUE, count_from = 1) {
#' rowSums() %>%
#' sort()
#' ## rerun to select other keywords
-#' ukimg_eu %>% tokens_proximity("britain")
+#' tok1 %>% tokens_proximity("britain")
#' @seealso [dfm.tokens_with_proximity()] [quanteda::tokens()]
#' @export
tokens_proximity <- function(x, keywords, get_min = TRUE, valuetype = c("glob", "regex", "fixed"), count_from = 1) {
@@ -139,22 +139,22 @@ convert.tokens_with_proximity <- function(x, to = c("data.frame"), ...) {
#' @details By default, words closer to keywords are weighted higher. You might change that with another `weight_function`. Please also note that `tolower` and `remove_padding` have no effect. It is because changing tokens at this point would need to recalculate the proximity vectors. Please do all the text manipulation before running [tokens_proximity()].
#' @examples
#' library(quanteda)
-#' ukimg_eu <- data_char_ukimmig2010 %>%
+#' tok1 <- data_char_ukimmig2010 %>%
#' tokens(remove_punct = TRUE) %>%
#' tokens_tolower() %>%
#' tokens_proximity(c("eu", "europe", "european"))
-#' ukimg_eu %>%
+#' tok1 %>%
#' dfm() %>%
#' dfm_select(c("immig*", "migr*")) %>%
#' rowSums() %>%
#' sort()
#' ## Words further away from keywords are weighted higher
-#' ukimg_eu %>%
+#' tok1 %>%
#' dfm(weight_function = identity) %>%
#' dfm_select(c("immig*", "migr*")) %>%
#' rowSums() %>%
#' sort()
-#' ukimg_eu %>%
+#' tok1 %>%
#' dfm(weight_function = function(x) {
#' 1 / x^2
#' }) %>%
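
The counting rule described in the `@details` block above is easiest to see on toy data. A minimal sketch, assuming the API exactly as documented in the hunks above; the expected vectors are transcribed from the `@details` prose, not from a live run:

``` r
library(quanteda)
library(quanteda.proximity)

# Keyword "eat" with count_from = 1: the keyword itself gets proximity 1
toy <- tokens("I eat this apple")
docvars(tokens_proximity(toy, "eat"), "proximity")
# expected, per the @details prose: 2 1 2 3

# Two keywords with get_min = TRUE: element-wise minimum over both anchors
toy2 <- tokens("I wash and eat this apple")
docvars(tokens_proximity(toy2, c("wash", "eat")), "proximity")
# expected, per the @details prose: 2 1 2 1 2 3
```
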
34 changes: 17 additions & 17 deletions README.Rmd
@@ -32,78 +32,78 @@ remotes::install_github("gesistsa/quanteda.proximity")
## Example

```{r example}
-suppressPackageStartupMessages(library(quanteda))
+library(quanteda, quietly = TRUE)
library(quanteda.proximity)
-testdata <-
+txt1 <-
c("Turkish President Tayyip Erdogan, in his strongest comments yet on the Gaza conflict, said on Wednesday the Palestinian militant group Hamas was not a terrorist organisation but a liberation group fighting to protect Palestinian lands.",
"EU policymakers proposed the new agency in 2021 to stop financial firms from aiding criminals and terrorists. Brussels has so far relied on national regulators with no EU authority to stop money laundering and terrorist financing running into billions of euros.")
```

`tokens_proximity()` generates the proximity vectors and stores them as a `docvar` (document variable).

```{r tokens_proximity}
-res <- testdata %>% tokens() %>% tokens_tolower() %>%
+tok1 <- txt1 %>% tokens() %>% tokens_tolower() %>%
tokens_proximity(keywords = "turkish")
-res
+tok1
```

You can access the proximity vectors by

```{r proximity_vectors}
-docvars(res, "proximity")
+docvars(tok1, "proximity")
```

The `tokens` object with proximity vectors can be converted to a (weighted) `dfm` (Document-Feature Matrix). The default weight is assigned by inverting the proximity.

```{r dfm}
-dfm(res)
+dfm(tok1)
```

You have the freedom to change to another weight function. For example, not inverting.

```{r dfm2}
-dfm(res, weight_function = identity)
+dfm(tok1, weight_function = identity)
```

Or any custom function

```{r dfm3}
-dfm(res, weight_function = function(x) { 1 / x^2 })
+dfm(tok1, weight_function = function(x) { 1 / x^2 })
```

## Application

A clumsy example to calculate the total inverse proximity weighted frequency of "terror*" words.

```{r}
-terror_dict <- dictionary(list(TERROR = c("terror*")))
+dict1 <- dictionary(list(TERROR = c("terror*")))
-dfm(res) %>% dfm_lookup(terror_dict) %>% rowSums()
+dfm(tok1) %>% dfm_lookup(dict1) %>% rowSums()
```

How about changing the target to "Hamas"?

```{r}
-res2 <- res %>% tokens_proximity(keywords = "hamas")
-res2
+tok2 <- tok1 %>% tokens_proximity(keywords = "hamas")
+tok2
```

```{r}
-dfm(res2) %>% dfm_lookup(terror_dict) %>% rowSums()
+dfm(tok2) %>% dfm_lookup(dict1) %>% rowSums()
```

Can we use two targets, e.g. "EU" and "Brussels"?

```{r}
-res3 <- res %>% tokens_proximity(keywords = c("eu", "brussels"))
-res3
+tok3 <- tok1 %>% tokens_proximity(keywords = c("eu", "brussels"))
+tok3
```

```{r}
-docvars(res3, "proximity")
+docvars(tok3, "proximity")
```

```{r}
-dfm(res3) %>% dfm_lookup(terror_dict) %>% rowSums()
+dfm(tok3) %>% dfm_lookup(dict1) %>% rowSums()
```
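
The three weightings scattered through the README chunks above differ only in the `weight_function` passed to `dfm()`. A side-by-side sketch, assuming `dfm.tokens_with_proximity()` behaves as its documentation in R/get_dist.R describes:

``` r
library(quanteda)
library(quanteda.proximity)

tok <- tokens("I eat this apple") %>%
  tokens_tolower() %>%
  tokens_proximity("eat")

dfm(tok)                                         # default: weight = 1 / proximity
dfm(tok, weight_function = identity)             # raw proximity: distant words weigh more
dfm(tok, weight_function = function(x) 1 / x^2)  # quadratic decay around the keyword
```
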
39 changes: 22 additions & 17 deletions README.md
@@ -22,10 +22,15 @@ remotes::install_github("gesistsa/quanteda.proximity")
## Example

``` r
-suppressPackageStartupMessages(library(quanteda))
+library(quanteda, quietly = TRUE)
+#> Package version: 3.3.1
+#> Unicode version: 14.0
+#> ICU version: 70.1
+#> Parallel computing: 8 of 8 threads used.
+#> See https://quanteda.io for tutorials and examples.
library(quanteda.proximity)

-testdata <-
+txt1 <-
c("Turkish President Tayyip Erdogan, in his strongest comments yet on the Gaza conflict, said on Wednesday the Palestinian militant group Hamas was not a terrorist organisation but a liberation group fighting to protect Palestinian lands.",
"EU policymakers proposed the new agency in 2021 to stop financial firms from aiding criminals and terrorists. Brussels has so far relied on national regulators with no EU authority to stop money laundering and terrorist financing running into billions of euros.")
```
@@ -34,9 +39,9 @@ c("Turkish President Tayyip Erdogan, in his strongest comments yet on the Gaza c
a `docvar` (document variable).

``` r
-res <- testdata %>% tokens() %>% tokens_tolower() %>%
+tok1 <- txt1 %>% tokens() %>% tokens_tolower() %>%
tokens_proximity(keywords = "turkish")
-res
+tok1
#> Tokens consisting of 2 documents and 1 docvar.
#> text1 :
#> [1] "turkish" "president" "tayyip" "erdogan" "," "in"
@@ -56,7 +61,7 @@ res
You can access the proximity vectors by

``` r
-docvars(res, "proximity")
+docvars(tok1, "proximity")
#> $text1
#> [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
#> [26] 26 27 28 29 30 31 32 33 34 35 36 37 38
@@ -71,7 +76,7 @@ The `tokens` object with proximity vectors can be converted to a
assigned by inverting the proximity.

``` r
-dfm(res)
+dfm(tok1)
#> Document-feature matrix of: 2 documents, 64 features (45.31% sparse) and 0 docvars.
#> features
#> docs turkish president tayyip erdogan , in his
@@ -88,7 +93,7 @@ You have the freedom to change to another weight function. For example,
not inverting.

``` r
-dfm(res, weight_function = identity)
+dfm(tok1, weight_function = identity)
#> Document-feature matrix of: 2 documents, 64 features (45.31% sparse) and 0 docvars.
#> features
#> docs turkish president tayyip erdogan , in his strongest comments yet
@@ -100,7 +105,7 @@ dfm(res, weight_function = identity)
Or any custom function

``` r
-dfm(res, weight_function = function(x) { 1 / x^2 })
+dfm(tok1, weight_function = function(x) { 1 / x^2 })
#> Document-feature matrix of: 2 documents, 64 features (45.31% sparse) and 0 docvars.
#> features
#> docs turkish president tayyip erdogan , in his
@@ -119,18 +124,18 @@ A clumsy example to calculate the total inverse proximity weighted
frequency of "terror\*" words.

``` r
-terror_dict <- dictionary(list(TERROR = c("terror*")))
+dict1 <- dictionary(list(TERROR = c("terror*")))

-dfm(res) %>% dfm_lookup(terror_dict) %>% rowSums()
+dfm(tok1) %>% dfm_lookup(dict1) %>% rowSums()
#> text1 text2
#> 0.03703704 0.04545455
```
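
These totals are plain sums of inverted proximities, so they can be checked by hand. The token positions here are assumptions inferred from the output rather than shown in the diff: text1 has one `terror*` match at proximity 27 from "turkish", while text2 never mentions "turkish", so both of its matches sit at the fallback proximity (document length + 1, here 44; compare the all-39 vectors for the 38-token text1 further down):

``` r
1 / 27      # text1: one match at proximity 27
#> [1] 0.03703704
2 * 1 / 44  # text2: two matches at the fallback proximity 44
#> [1] 0.04545455
```
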

How about changing the target to “Hamas”?

``` r
-res2 <- res %>% tokens_proximity(keywords = "hamas")
-res2
+tok2 <- tok1 %>% tokens_proximity(keywords = "hamas")
+tok2
#> Tokens consisting of 2 documents and 1 docvar.
#> text1 :
#> [1] "turkish" "president" "tayyip" "erdogan" "," "in"
@@ -148,16 +153,16 @@ res2
```

``` r
-dfm(res2) %>% dfm_lookup(terror_dict) %>% rowSums()
+dfm(tok2) %>% dfm_lookup(dict1) %>% rowSums()
#> text1 text2
#> 0.20000000 0.04545455
```

Can we use two targets, e.g. “EU” and “Brussels”?

``` r
-res3 <- res %>% tokens_proximity(keywords = c("eu", "brussels"))
-res3
+tok3 <- tok1 %>% tokens_proximity(keywords = c("eu", "brussels"))
+tok3
#> Tokens consisting of 2 documents and 1 docvar.
#> text1 :
#> [1] "turkish" "president" "tayyip" "erdogan" "," "in"
@@ -175,7 +180,7 @@ res3
```

``` r
-docvars(res3, "proximity")
+docvars(tok3, "proximity")
#> $text1
#> [1] 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39
#> [26] 39 39 39 39 39 39 39 39 39 39 39 39 39
@@ -186,7 +191,7 @@ docvars(res3, "proximity")
```

``` r
-dfm(res3) %>% dfm_lookup(terror_dict) %>% rowSums()
+dfm(tok3) %>% dfm_lookup(dict1) %>% rowSums()
#> text1 text2
#> 0.02564103 0.45833333
```
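
One method the collapsed hunks only hint at: the header for R/get_dist.R shows `convert.tokens_with_proximity <- function(x, to = c("data.frame"), ...)`. A speculative sketch based on that signature alone, since the method body is not expanded in this diff:

``` r
library(quanteda)
library(quanteda.proximity)

tok <- tokens("I eat this apple") %>%
  tokens_tolower() %>%
  tokens_proximity("eat")

# Presumably a long-format table of tokens and their proximities (inferred, not verified)
convert(tok, to = "data.frame")
```
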
27 changes: 21 additions & 6 deletions man/dfm.tokens_with_proximity.Rd


33 changes: 22 additions & 11 deletions man/tokens_proximity.Rd

