add hydrate_tweets (#260)
* Update utils.R

Added an option to retrieve erroneous tweets together with their error title. Changes are made in get_tweets() and df_to_json() to bind the erroneous tweets to the returned data frame and to write `error_` JSON files in the data_path.

* Create tweets_lookup

Adds a function to access the v2 Tweets Lookup API endpoint (https://developer.twitter.com/en/docs/twitter-api/tweets/lookup/introduction). This is useful for rehydrating tweets from their Tweet IDs. An option to catch errors in the lookup process is added (see the changes to utils for the implementation).
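
For orientation, a rough sketch of the raw request this endpoint expects (a minimal sketch, assuming a bearer token in a hypothetical TWITTER_BEARER environment variable; the two ids are placeholders; httr is the HTTP client the package already uses):

library(httr)

## one GET request to the v2 Tweets Lookup endpoint; up to 100
## comma-separated ids can be passed per request
r <- GET(
  "https://api.twitter.com/2/tweets",
  add_headers(Authorization = paste0("Bearer ", Sys.getenv("TWITTER_BEARER"))),
  query = list(ids = "1266876474440761346,1266868259925737474")
)
content(r) ## parsed JSON: a `data` element, plus `errors` for unresolvable ids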

* Update tweets_lookup

Added a total success overview at the end of the loop.

* Make runnable; pass all old unit tests

* Add Tim König as a contributor

* Exclude the `errors` part for now (it will be handled in another PR)

* Add documentation and rename `tweets_lookup` to `hydrate_tweets`

* Add tests for corner cases

* Add basic test cases

* Clear check messages / errors

* Correct documentation of `hydrate_tweets` on `context_annotations`

* Update hydrate_tweets.R

Minor adjustment to make sure the reporting doesn't break the function when bind_tweets == FALSE (the "Retrieved" and "Total Tweets" verbose messages rely on the new_rows object, which is only created when tweets are bound).

* Reduce some verbosity

Co-authored-by: chainsawriot <[email protected]>
TimBMK and chainsawriot authored Dec 17, 2021
1 parent 352e359 commit 85f82a8
Showing 15 changed files with 532 additions and 2 deletions.
9 changes: 7 additions & 2 deletions DESCRIPTION
@@ -21,7 +21,11 @@ Authors@R:
family = "Rico",
role = c("ctb"),
email = "[email protected]",
comment = c(ORCID = "0000-0002-6169-4523")))
comment = c(ORCID = "0000-0002-6169-4523")),
person(given = "Tim",
family = "König",
role = c("ctb"),
comment = c(ORCID = "0000-0002-2852-2690")))
Description: Package to query the Twitter Academic Research Product Track,
providing access to full-archive search and other v2 API endpoints. Functions
are written with academic research in mind. They provide flexibility in how
@@ -48,7 +52,8 @@ Imports:
tidyr,
tidyselect,
purrr,
rlang
rlang,
utils
Suggests:
knitr,
rmarkdown,
1 change: 1 addition & 0 deletions NAMESPACE
@@ -33,6 +33,7 @@ export(get_user_profile)
export(get_user_timeline)
export(get_user_tweets)
export(get_video_tweets)
export(hydrate_tweets)
export(list_compliance_jobs)
export(resume_collection)
export(set_bearer)
82 changes: 82 additions & 0 deletions R/hydrate_tweets.R
@@ -0,0 +1,82 @@
#' Hydrate Tweets Based On Tweet IDs
#'
#' This function is helpful for hydrating Tweet IDs (i.e. getting the full content of tweets from a list of Tweet IDs).
#' @inheritParams get_all_tweets
#' @param ids a character vector of Tweet IDs
#' @param context_annotations If `TRUE`, context_annotations will be fetched.
#' @return When `bind_tweets` is `TRUE`, the function returns a data frame of tweets. When `bind_tweets` is `FALSE`, it returns the `data_path` invisibly.
#' @examples
#' \dontrun{
#' hydrate_tweets(c("1266876474440761346", "1266868259925737474", "1266867327079002121",
#' "1266866660713127936", "1266864490446012418", "1266860737244336129",
#' "1266859737615826944", "1266859455586676736", "1266858090143588352",
#' "1266857669157097473"))
#' }
#' @export
hydrate_tweets <- function(ids, bearer_token = get_bearer(), data_path = NULL,
context_annotations = FALSE,
bind_tweets = TRUE,
verbose = TRUE) {
## verbose = TRUE,
## errors = FALSE) {
## Building parameters for get_tweets()
if (is.null(data_path) && !bind_tweets) {
stop("Argument (bind_tweets = FALSE) only valid when a data_path is specified.")
}
params <- list(
tweet.fields = "attachments,author_id,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,source,text,withheld",
user.fields = "created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld",
expansions = "author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id",
place.fields = "contained_within,country,country_code,full_name,geo,id,name,place_type"
)
if (context_annotations) {
params[["tweet.fields"]] <- "attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,source,text,withheld"
}
## loop through ids in batches of 100 IDs
new_df <- data.frame()
if (length(ids) >= 1) {
n_batches <- ceiling(length(ids) / 100)
} else {
n_batches <- 0
}
endpoint_url <- "https://api.twitter.com/2/tweets"
for (i in seq_len(n_batches)) {
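## each batch covers ids[(i - 1) * 100 + 1] through ids[i * 100],
## capped at length(ids) for the final (possibly partial) batch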
batch <- ids[((i-1)*100+1):min(length(ids),(i*100))]
params[["ids"]] <- paste0(batch, collapse = ",")

## Get tweets
.vcat(verbose, "Batch", i, "out of", ceiling(length(ids) / 100),": ids", utils::head(batch, n = 1), "to", utils::tail(batch, n = 1), "\n")
## new_rows <- get_tweets(params = params, endpoint_url = endpoint_url, n = Inf, file = NULL, bearer_token = bearer_token,
## export_query = FALSE, data_path = data_path, bind_tweets = bind_tweets, verbose = verbose, errors = errors)
new_rows <- get_tweets(params = params, endpoint_url = endpoint_url, n = Inf, file = NULL, bearer_token = bearer_token,
export_query = FALSE, data_path = data_path, bind_tweets = bind_tweets, verbose = FALSE)

if (bind_tweets) {
## if (errors){
## .vcat(verbose, "Retrieved", nrow(dplyr::filter(new_rows, is.na(error))), "out of", length(batch), "\n" ,
## "Errors:", nrow(dplyr::filter(new_rows, !is.na(error))), "\n" )
## } else {
.vcat(verbose, "Retrieved", nrow(new_rows), "out of", length(batch), "\n")
## }

## new_rows$from_tweet_id <- batch[batch %in% new_rows$id]
if (nrow(new_rows) > 0) {
new_df <- dplyr::bind_rows(new_df, new_rows) # add new rows
}

## if (errors) {
## .vcat(verbose, "Total Tweets:", nrow(dplyr::filter(new_df, is.na(error))), "\n")
## } else {
.vcat(verbose, "Total Tweets:", nrow(new_df), "\n")
## }
## }
## if (errors) {
## .vcat(verbose, "Total of", nrow(dplyr::filter(new_df, is.na(error))), "out of", length(ids), "tweets retrieved.\n")
## }
}
}
if (bind_tweets) {
return(new_df)
}
return(invisible(data_path))
}
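
For reference, a minimal usage sketch of the new function as defined above (the ids come from the roxygen example; "tweet_data/" is a hypothetical directory):

ids <- c("1266876474440761346", "1266868259925737474")

## default: tweets are bound and returned as a data frame
tweets <- hydrate_tweets(ids)

## alternative: write the raw json files to data_path and skip binding;
## the data_path is then returned invisibly
hydrate_tweets(ids, data_path = "tweet_data/", bind_tweets = FALSE)
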
42 changes: 42 additions & 0 deletions man/hydrate_tweets.Rd


Binary file added tests/testdata/fff_de.RDS
35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-2b14f0.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-302f98.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-7486fa.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-80de3f.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-9df0f5.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-c4000b.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-e1624c.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-e5b077.R


35 changes: 35 additions & 0 deletions tests/testthat/api.twitter.com/2/tweets-f81159.R
@@ -0,0 +1,35 @@
structure(list(url = "https://api.twitter.com/2/tweets?tweet.fields=attachments%2Cauthor_id%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Cpossibly_sensitive%2Creferenced_tweets%2Csource%2Ctext%2Cwithheld&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&expansions=author_id%2Centities.mentions.username%2Cgeo.place_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1266876474440761346",
status_code = 200L, headers = structure(list(date = "Thu, 16 Dec 2021 16:37:21 UTC",
server = "tsa_o", `api-version` = "2.32", `content-type` = "application/json; charset=utf-8",
`cache-control` = "no-cache, no-store, max-age=0", `content-length` = "943",
`x-access-level` = "read", `x-frame-options` = "SAMEORIGIN",
`content-encoding` = "gzip", `x-xss-protection` = "0",
`x-rate-limit-limit` = "300", `x-rate-limit-reset` = "1639673360",
`content-disposition` = "attachment; filename=json.json",
`x-content-type-options` = "nosniff", `x-rate-limit-remaining` = "295",
`strict-transport-security` = "max-age=631138519", `x-response-time` = "235",
`x-connection-hash` = "5496ad10d9eafbe645adefab538b1910ec789fbddb69cce5e9a87564645c0780"), class = c("insensitive",
"list")), all_headers = list(list(status = 200L, version = "HTTP/2",
headers = structure(list(date = "Thu, 16 Dec 2021 16:37:21 UTC",
server = "tsa_o", `api-version` = "2.32", `content-type` = "application/json; charset=utf-8",
`cache-control` = "no-cache, no-store, max-age=0",
`content-length` = "943", `x-access-level` = "read",
`x-frame-options` = "SAMEORIGIN", `content-encoding` = "gzip",
`x-xss-protection` = "0", `x-rate-limit-limit` = "300",
`x-rate-limit-reset` = "1639673360", `content-disposition` = "attachment; filename=json.json",
`x-content-type-options` = "nosniff", `x-rate-limit-remaining` = "295",
`strict-transport-security` = "max-age=631138519",
`x-response-time` = "235", `x-connection-hash` = "5496ad10d9eafbe645adefab538b1910ec789fbddb69cce5e9a87564645c0780"), class = c("insensitive",
"list")))), cookies = structure(list(domain = c(".twitter.com",
".twitter.com", ".twitter.com", ".twitter.com"), flag = c(TRUE,
TRUE, TRUE, TRUE), path = c("/", "/", "/", "/"), secure = c(TRUE,
TRUE, TRUE, TRUE), expiration = structure(c(1702744284, 1702744284,
1702744284, 1702744284), class = c("POSIXct", "POSIXt")),
name = c("guest_id_marketing", "guest_id_ads", "personalization_id",
"guest_id"), value = c("REDACTED", "REDACTED", "REDACTED",
"REDACTED")), row.names = c(NA, -4L), class = "data.frame"),
content = charToRaw("{\"data\":[{\"lang\":\"de\",\"conversation_id\":\"1266876474440761346\",\"author_id\":\"282135623\",\"text\":\"diese Petition mitzeichnen!!\\n. \\nKonsequente Ausrichtung eines zukünftigen Konjunkturpakets.\\nNur soziale und technische Innovationen, #Klimaschutz und gesetzlich verankerte Gemeinwohlorientierung machen Deutschland zukunftsfähig.\\n#Datteln4\\n#FridaysForFuture\\nhttps://t.co/eZ8j61yftj\",\"public_metrics\":{\"retweet_count\":2,\"reply_count\":0,\"like_count\":3,\"quote_count\":0},\"possibly_sensitive\":false,\"created_at\":\"2020-05-30T23:37:43.000Z\",\"entities\":{\"urls\":[{\"start\":257,\"end\":280,\"url\":\"https://t.co/eZ8j61yftj\",\"expanded_url\":\"https://epetitionen.bundestag.de/petitionen/_2020/_04/_21/Petition_110043.%24%24%24.a.u.html\",\"display_url\":\"epetitionen.bundestag.de/petitionen/_20…\"}],\"hashtags\":[{\"start\":133,\"end\":145,\"tag\":\"Klimaschutz\"},{\"start\":229,\"end\":238,\"tag\":\"Datteln4\"},{\"start\":239,\"end\":256,\"tag\":\"FridaysForFuture\"}]},\"source\":\"Twitter for Android\",\"id\":\"1266876474440761346\"}],\"includes\":{\"users\":[{\"pinned_tweet_id\":\"1469634511591944194\",\"public_metrics\":{\"followers_count\":3373,\"following_count\":3311,\"tweet_count\":52294,\"listed_count\":0},\"protected\":false,\"name\":\"W. Heinrich ✌️\",\"created_at\":\"2011-04-14T16:25:59.000Z\",\"username\":\"1xKlaudius\",\"profile_image_url\":\"https://pbs.twimg.com/profile_images/1336439514785574915/yo0Fgf8u_normal.jpg\",\"verified\":false,\"location\":\"Franken / Bayern\",\"id\":\"282135623\",\"url\":\"\",\"description\":\"Kritisch - politisch Bio.... LinksGrün versifft... das ist gut so!\\nFalls jemand fragt...? \\nIch bin Radelfahrer bei jeder Gelegenheit und\\nschon lange Autofrei.\"}]}}"),
date = structure(1639672641, class = c("POSIXct", "POSIXt"
), tzone = "GMT"), times = c(redirect = 0, namelookup = 2.8e-05,
connect = 2.9e-05, pretransfer = 0.000109, starttransfer = 0.251372,
total = 0.251693)), class = "response")
85 changes: 85 additions & 0 deletions tests/testthat/test-hydrate.R
@@ -0,0 +1,85 @@
## These tests use cases where the ids are all valid.

test_that("Corner cases", {
expect_error(capture_warnings(hydrate_tweets()))
expect_error(capture_warnings(hydrate_tweets(c())), NA)
capture_warnings(res <- hydrate_tweets(c()))
expect_equal(nrow(res), 0)
expect_equal(class(res), "data.frame")
})

## require(httptest)
## start_capturing(simplify = FALSE)
## fff <- readRDS("../testdata/fff_de.RDS")
## hydrate_tweets(fff, verbose = FALSE)
## stop_capturing()

with_mock_api({
test_that("normal case: fff de", {
skip_if(!dir.exists("api.twitter.com"))
fff <- readRDS("../testdata/fff_de.RDS")
expect_error(res <- hydrate_tweets(fff, verbose = FALSE), NA)
expect_equal(nrow(res), length(fff))
})
test_that("normal case: verbose", {
skip_if(!dir.exists("api.twitter.com"))
fff <- readRDS("../testdata/fff_de.RDS")
expect_silent(res <- hydrate_tweets(fff, verbose = FALSE))
expect_output(capture_warnings(res <- hydrate_tweets(fff, verbose = TRUE)))
})
test_that("normal case: bind_tweets", {
skip_if(!dir.exists("api.twitter.com"))
fff <- readRDS("../testdata/fff_de.RDS")
emptydir <- academictwitteR:::.gen_random_dir()
expect_silent(res <- hydrate_tweets(fff, verbose = FALSE, data_path = emptydir))
expect_true(length(list.files(emptydir, "json$")) > 0)
expect_error(z <- bind_tweets(emptydir, verbose = FALSE), NA)
expect_equal(length(fff), nrow(z))
unlink(emptydir, recursive = TRUE)
emptydir <- academictwitteR:::.gen_random_dir()
expect_silent(capture_warnings(res <- hydrate_tweets(fff, verbose = FALSE, data_path = emptydir, bind_tweets = FALSE)))
## error when data_path is null and bind_tweets is FALSE
## the same expected behavior to `get_all_tweets`
expect_error(capture_warnings(res <- hydrate_tweets(fff, data_path = NULL, bind_tweets = FALSE)))
})
})

## require(httptest)
## start_capturing(simplify = FALSE)
## fff <- readRDS("../testdata/fff_de.RDS")
## hydrate_tweets(fff, verbose = FALSE, context_annotations = TRUE)
## stop_capturing()

with_mock_api({
test_that("normal case: context_anntations", {
skip_if(!dir.exists("api.twitter.com"))
fff <- readRDS("../testdata/fff_de.RDS")
ca1 <- hydrate_tweets(fff, verbose = FALSE, context_annotations = TRUE)
expect_true("context_annotations" %in% colnames(ca1))
expect_equal(nrow(ca1), length(fff))
ca0 <- hydrate_tweets(fff, verbose = FALSE, context_annotations = FALSE)
expect_false("context_annotations" %in% colnames(ca0))
expect_equal(nrow(ca0), length(fff))
})
})

## require(httptest)
## start_capturing(simplify = FALSE)
## fff <- readRDS("../testdata/fff_de.RDS")
## manyf <- c(fff, fff, fff)
## for (i in c(1, 99, 100, 199, 200, 250)) {
## hydrate_tweets(manyf[seq_len(i)], verbose = FALSE)
## }
## stop_capturing()

with_mock_api({
test_that("normal case: different sizes", {
skip_if(!dir.exists("api.twitter.com"))
fff <- readRDS("../testdata/fff_de.RDS")
manyf <- c(fff, fff, fff)
for (i in c(1, 99, 100, 199, 200, 250)) {
expect_error(res <- hydrate_tweets(manyf[seq_len(i)], verbose = FALSE), NA)
expect_equal(nrow(res), i)
}
})
})
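
As an aside, the new tests can be run on their own with testthat from the package root (assuming the mock fixtures under tests/testthat/api.twitter.com are present):

testthat::test_file("tests/testthat/test-hydrate.R")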
