add hydrate_tweets (#260)
* Update utils.R

Added an option to retrieve erroneous tweets together with their error title. Changes are made in get_tweets() and df_to_json() to bind the erroneous tweets to the returned data frame and to write `error_` JSON files in the data_path.

* Create tweets_lookup

Adds a function to access the v2 Tweets Lookup API endpoint (https://developer.twitter.com/en/docs/twitter-api/tweets/lookup/introduction). This is useful for rehydrating tweets from their Tweet IDs. An option to catch errors in the lookup process is added (see the changes to utils for the implementation).
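
For orientation, a rough sketch of the raw request this endpoint expects (a minimal sketch, assuming a bearer token in a hypothetical TWITTER_BEARER environment variable; the two ids are placeholders; httr is the HTTP client the package already uses):

library(httr)

## one GET request to the v2 Tweets Lookup endpoint; up to 100
## comma-separated ids can be passed per request
r <- GET(
  "https://api.twitter.com/2/tweets",
  add_headers(Authorization = paste0("Bearer ", Sys.getenv("TWITTER_BEARER"))),
  query = list(ids = "1266876474440761346,1266868259925737474")
)
content(r) ## parsed JSON: a `data` element, plus `errors` for unresolvable ids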

* Update tweets_lookup

Added a total success overview at the end of the loop.

* Make runnable; pass all old unit tests

* Add Tim König as a contributor

* Exclude the `errors` part for now (it will be handled in another PR)

* Add documentation and rename `tweets_lookup` to `hydrate_tweets`

* Add tests for corner cases

* Add basic test cases

* Clear check messages / errors

* Correct documentation of `hydrate_tweets` on `context_annotations`

* Update hydrate_tweets.R

Minor adjustment to make sure the reporting doesn't break the function when bind_tweets == FALSE (the "Retrieved" and "Total Tweets" verbose messages rely on the new_rows object, which is only created when tweets are bound).

* Reduce some verbosity

Co-authored-by: chainsawriot <[email protected]>
TimBMK and chainsawriot authored Dec 17, 2021
1 parent 352e359 commit 85f82a8
Showing 15 changed files with 532 additions and 2 deletions.
9 changes: 7 additions & 2 deletions DESCRIPTION
@@ -21,7 +21,11 @@ Authors@R:
family = "Rico",
role = c("ctb"),
email = "[email protected]",
comment = c(ORCID = "0000-0002-6169-4523")))
comment = c(ORCID = "0000-0002-6169-4523")),
person(given = "Tim",
family = "König",
role = c("ctb"),
comment = c(ORCID = "0000-0002-2852-2690")))
Description: Package to query the Twitter Academic Research Product Track,
providing access to full-archive search and other v2 API endpoints. Functions
are written with academic research in mind. They provide flexibility in how
@@ -48,7 +52,8 @@ Imports:
tidyr,
tidyselect,
purrr,
rlang
rlang,
utils
Suggests:
knitr,
rmarkdown,
1 change: 1 addition & 0 deletions NAMESPACE
@@ -33,6 +33,7 @@ export(get_user_profile)
export(get_user_timeline)
export(get_user_tweets)
export(get_video_tweets)
export(hydrate_tweets)
export(list_compliance_jobs)
export(resume_collection)
export(set_bearer)
82 changes: 82 additions & 0 deletions R/hydrate_tweets.R
@@ -0,0 +1,82 @@
#' Hydrate Tweets Based On Tweet IDs
#'
#' This function is helpful for hydrating Tweet IDs (i.e. getting the full content of tweets from a list of Tweet IDs).
#' @inheritParams get_all_tweets
#' @param ids a character vector of Tweet IDs
#' @param context_annotations If `TRUE`, context_annotations will be fetched.
#' @return When `bind_tweets` is `TRUE`, the function returns a data frame of tweets. When `bind_tweets` is `FALSE`, it returns the `data_path` invisibly.
#' @examples
#' \dontrun{
#' hydrate_tweets(c("1266876474440761346", "1266868259925737474", "1266867327079002121",
#' "1266866660713127936", "1266864490446012418", "1266860737244336129",
#' "1266859737615826944", "1266859455586676736", "1266858090143588352",
#' "1266857669157097473"))
#' }
#' @export
hydrate_tweets <- function(ids, bearer_token = get_bearer(), data_path = NULL,
context_annotations = FALSE,
bind_tweets = TRUE,
verbose = TRUE) {
## verbose = TRUE,
## errors = FALSE) {
## Building parameters for get_tweets()
if (is.null(data_path) && !bind_tweets) {
stop("Argument (bind_tweets = FALSE) only valid when a data_path is specified.")
}
params <- list(
tweet.fields = "attachments,author_id,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,source,text,withheld",
user.fields = "created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld",
expansions = "author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id",
place.fields = "contained_within,country,country_code,full_name,geo,id,name,place_type"
)
if (context_annotations) {
params[["tweet.fields"]] <- "attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,source,text,withheld"
}
## loop through ids in batches of 100 IDs
new_df <- data.frame()
if (length(ids) >= 1) {
n_batches <- ceiling(length(ids) / 100)
} else {
n_batches <- 0
}
endpoint_url <- "https://api.twitter.com/2/tweets"
for (i in seq_len(n_batches)) {
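## each batch covers ids[(i - 1) * 100 + 1] through ids[i * 100],
## capped at length(ids) for the final (possibly partial) batch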
batch <- ids[((i-1)*100+1):min(length(ids),(i*100))]
params[["ids"]] <- paste0(batch, collapse = ",")

## Get tweets
.vcat(verbose, "Batch", i, "out of", ceiling(length(ids) / 100),": ids", utils::head(batch, n = 1), "to", utils::tail(batch, n = 1), "\n")
## new_rows <- get_tweets(params = params, endpoint_url = endpoint_url, n = Inf, file = NULL, bearer_token = bearer_token,
## export_query = FALSE, data_path = data_path, bind_tweets = bind_tweets, verbose = verbose, errors = errors)
new_rows <- get_tweets(params = params, endpoint_url = endpoint_url, n = Inf, file = NULL, bearer_token = bearer_token,
export_query = FALSE, data_path = data_path, bind_tweets = bind_tweets, verbose = FALSE)

if (bind_tweets) {
## if (errors){
## .vcat(verbose, "Retrieved", nrow(dplyr::filter(new_rows, is.na(error))), "out of", length(batch), "\n" ,
## "Errors:", nrow(dplyr::filter(new_rows, !is.na(error))), "\n" )
## } else {
.vcat(verbose, "Retrieved", nrow(new_rows), "out of", length(batch), "\n")
## }

## new_rows$from_tweet_id <- batch[batch %in% new_rows$id]
if (nrow(new_rows) > 0) {
new_df <- dplyr::bind_rows(new_df, new_rows) # add new rows
}

## if (errors) {
## .vcat(verbose, "Total Tweets:", nrow(dplyr::filter(new_df, is.na(error))), "\n")
## } else {
.vcat(verbose, "Total Tweets:", nrow(new_df), "\n")
## }
## }
## if (errors) {
## .vcat(verbose, "Total of", nrow(dplyr::filter(new_df, is.na(error))), "out of", length(ids), "tweets retrieved.\n")
## }
}
}
if (bind_tweets) {
return(new_df)
}
return(invisible(data_path))
}
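
For reference, a minimal usage sketch of the new function as defined above (the ids come from the roxygen example; "tweet_data/" is a hypothetical directory):

ids <- c("1266876474440761346", "1266868259925737474")

## default: tweets are bound and returned as a data frame
tweets <- hydrate_tweets(ids)

## alternative: write the raw json files to data_path and skip binding;
## the data_path is then returned invisibly
hydrate_tweets(ids, data_path = "tweet_data/", bind_tweets = FALSE)
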
42 changes: 42 additions & 0 deletions man/hydrate_tweets.Rd


Binary file added tests/testdata/fff_de.RDS
35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-2b14f0.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-302f98.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-7486fa.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-80de3f.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-9df0f5.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-c4000b.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-e1624c.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-e5b077.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-f81159.R
@@ -0,0 +1,35 @@
structure(list(url = "https://api.twitter.com/2/tweets?tweet.fields=attachments%2Cauthor_id%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Cpossibly_sensitive%2Creferenced_tweets%2Csource%2Ctext%2Cwithheld&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&expansions=author_id%2Centities.mentions.username%2Cgeo.place_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1266876474440761346",
status_code = 200L, headers = structure(list(date = "Thu, 16 Dec 2021 16:37:21 UTC",
server = "tsa_o", `api-version` = "2.32", `content-type` = "application/json; charset=utf-8",
`cache-control` = "no-cache, no-store, max-age=0", `content-length` = "943",
`x-access-level` = "read", `x-frame-options` = "SAMEORIGIN",
`content-encoding` = "gzip", `x-xss-protection` = "0",
`x-rate-limit-limit` = "300", `x-rate-limit-reset` = "1639673360",
`content-disposition` = "attachment; filename=json.json",
`x-content-type-options` = "nosniff", `x-rate-limit-remaining` = "295",
`strict-transport-security` = "max-age=631138519", `x-response-time` = "235",
`x-connection-hash` = "5496ad10d9eafbe645adefab538b1910ec789fbddb69cce5e9a87564645c0780"), class = c("insensitive",
"list")), all_headers = list(list(status = 200L, version = "HTTP/2",
headers = structure(list(date = "Thu, 16 Dec 2021 16:37:21 UTC",
server = "tsa_o", `api-version` = "2.32", `content-type` = "application/json; charset=utf-8",
`cache-control` = "no-cache, no-store, max-age=0",
`content-length` = "943", `x-access-level` = "read",
`x-frame-options` = "SAMEORIGIN", `content-encoding` = "gzip",
`x-xss-protection` = "0", `x-rate-limit-limit` = "300",
`x-rate-limit-reset` = "1639673360", `content-disposition` = "attachment; filename=json.json",
`x-content-type-options` = "nosniff", `x-rate-limit-remaining` = "295",
`strict-transport-security` = "max-age=631138519",
`x-response-time` = "235", `x-connection-hash` = "5496ad10d9eafbe645adefab538b1910ec789fbddb69cce5e9a87564645c0780"), class = c("insensitive",
"list")))), cookies = structure(list(domain = c(".twitter.com",
".twitter.com", ".twitter.com", ".twitter.com"), flag = c(TRUE,
TRUE, TRUE, TRUE), path = c("/", "/", "/", "/"), secure = c(TRUE,
TRUE, TRUE, TRUE), expiration = structure(c(1702744284, 1702744284,
1702744284, 1702744284), class = c("POSIXct", "POSIXt")),
name = c("guest_id_marketing", "guest_id_ads", "personalization_id",
"guest_id"), value = c("REDACTED", "REDACTED", "REDACTED",
"REDACTED")), row.names = c(NA, -4L), class = "data.frame"),
content = charToRaw("{\"data\":[{\"lang\":\"de\",\"conversation_id\":\"1266876474440761346\",\"author_id\":\"282135623\",\"text\":\"diese Petition mitzeichnen!!\\n. \\nKonsequente Ausrichtung eines zukünftigen Konjunkturpakets.\\nNur soziale und technische Innovationen, #Klimaschutz und gesetzlich verankerte Gemeinwohlorientierung machen Deutschland zukunftsfähig.\\n#Datteln4\\n#FridaysForFuture\\nhttps://t.co/eZ8j61yftj\",\"public_metrics\":{\"retweet_count\":2,\"reply_count\":0,\"like_count\":3,\"quote_count\":0},\"possibly_sensitive\":false,\"created_at\":\"2020-05-30T23:37:43.000Z\",\"entities\":{\"urls\":[{\"start\":257,\"end\":280,\"url\":\"https://t.co/eZ8j61yftj\",\"expanded_url\":\"https://epetitionen.bundestag.de/petitionen/_2020/_04/_21/Petition_110043.%24%24%24.a.u.html\",\"display_url\":\"epetitionen.bundestag.de/petitionen/_20…\"}],\"hashtags\":[{\"start\":133,\"end\":145,\"tag\":\"Klimaschutz\"},{\"start\":229,\"end\":238,\"tag\":\"Datteln4\"},{\"start\":239,\"end\":256,\"tag\":\"FridaysForFuture\"}]},\"source\":\"Twitter for Android\",\"id\":\"1266876474440761346\"}],\"includes\":{\"users\":[{\"pinned_tweet_id\":\"1469634511591944194\",\"public_metrics\":{\"followers_count\":3373,\"following_count\":3311,\"tweet_count\":52294,\"listed_count\":0},\"protected\":false,\"name\":\"W. Heinrich ✌️\",\"created_at\":\"2011-04-14T16:25:59.000Z\",\"username\":\"1xKlaudius\",\"profile_image_url\":\"https://pbs.twimg.com/profile_images/1336439514785574915/yo0Fgf8u_normal.jpg\",\"verified\":false,\"location\":\"Franken / Bayern\",\"id\":\"282135623\",\"url\":\"\",\"description\":\"Kritisch - politisch Bio.... LinksGrün versifft... das ist gut so!\\nFalls jemand fragt...? \\nIch bin Radelfahrer bei jeder Gelegenheit und\\nschon lange Autofrei.\"}]}}"),
date = structure(1639672641, class = c("POSIXct", "POSIXt"
), tzone = "GMT"), times = c(redirect = 0, namelookup = 2.8e-05,
connect = 2.9e-05, pretransfer = 0.000109, starttransfer = 0.251372,
total = 0.251693)), class = "response")
85 changes: 85 additions & 0 deletions tests/testthat/test-hydrate.R
@@ -0,0 +1,85 @@
## These tests use cases where the ids are all valid.

test_that("Corner cases", {
expect_error(capture_warnings(hydrate_tweets()))
expect_error(capture_warnings(hydrate_tweets(c())), NA)
capture_warnings(res <- hydrate_tweets(c()))
expect_equal(nrow(res), 0)
expect_equal(class(res), "data.frame")
})

## require(httptest)
## start_capturing(simplify = FALSE)
## fff <- readRDS("../testdata/fff_de.RDS")
## hydrate_tweets(fff, verbose = FALSE)
## stop_capturing()

with_mock_api({
test_that("normal case: fff de", {
skip_if(!dir.exists("api.twitter.com"))
fff <- readRDS("../testdata/fff_de.RDS")
expect_error(res <- hydrate_tweets(fff, verbose = FALSE), NA)
expect_equal(nrow(res), length(fff))
})
test_that("normal case: verbose", {
skip_if(!dir.exists("api.twitter.com"))
fff <- readRDS("../testdata/fff_de.RDS")
expect_silent(res <- hydrate_tweets(fff, verbose = FALSE))
expect_output(capture_warnings(res <- hydrate_tweets(fff, verbose = TRUE)))
})
test_that("normal case: bind_tweets", {
skip_if(!dir.exists("api.twitter.com"))
fff <- readRDS("../testdata/fff_de.RDS")
emptydir <- academictwitteR:::.gen_random_dir()
expect_silent(res <- hydrate_tweets(fff, verbose = FALSE, data_path = emptydir))
expect_true(length(list.files(emptydir, "json$")) > 0)
expect_error(z <- bind_tweets(emptydir, verbose = FALSE), NA)
expect_equal(length(fff), nrow(z))
unlink(emptydir, recursive = TRUE)
emptydir <- academictwitteR:::.gen_random_dir()
expect_silent(capture_warnings(res <- hydrate_tweets(fff, verbose = FALSE, data_path = emptydir, bind_tweets = FALSE)))
## error when data_path is null and bind_tweets is FALSE
## the same expected behavior to `get_all_tweets`
expect_error(capture_warnings(res <- hydrate_tweets(fff, data_path = NULL, bind_tweets = FALSE)))
})
})

## require(httptest)
## start_capturing(simplify = FALSE)
## fff <- readRDS("../testdata/fff_de.RDS")
## hydrate_tweets(fff, verbose = FALSE, context_annotations = TRUE)
## stop_capturing()

with_mock_api({
test_that("normal case: context_anntations", {
skip_if(!dir.exists("api.twitter.com"))
fff <- readRDS("../testdata/fff_de.RDS")
ca1 <- hydrate_tweets(fff, verbose = FALSE, context_annotations = TRUE)
expect_true("context_annotations" %in% colnames(ca1))
expect_equal(nrow(ca1), length(fff))
ca0 <- hydrate_tweets(fff, verbose = FALSE, context_annotations = FALSE)
expect_false("context_annotations" %in% colnames(ca0))
expect_equal(nrow(ca0), length(fff))
})
})

## require(httptest)
## start_capturing(simplify = FALSE)
## fff <- readRDS("../testdata/fff_de.RDS")
## manyf <- c(fff, fff, fff)
## for (i in c(1, 99, 100, 199, 200, 250)) {
## hydrate_tweets(manyf[seq_len(i)], verbose = FALSE)
## }
## stop_capturing()

with_mock_api({
test_that("normal case: different sizes", {
skip_if(!dir.exists("api.twitter.com"))
fff <- readRDS("../testdata/fff_de.RDS")
manyf <- c(fff, fff, fff)
for (i in c(1, 99, 100, 199, 200, 250)) {
expect_error(res <- hydrate_tweets(manyf[seq_len(i)], verbose = FALSE), NA)
expect_equal(nrow(res), i)
}
})
})
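
As an aside, the new tests can be run on their own with testthat from the package root (assuming the mock fixtures under tests/testthat/api.twitter.com are present):

testthat::test_file("tests/testthat/test-hydrate.R")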
