This repository was archived by the owner on Jun 30, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Update utils.R Added an option to retrieve erroneous tweets with their error title. Changes are made in get_tweets() and df_to_json() in order to bind the erroneous tweets to the bound dataframe and make error_-json files in the data_path * Create tweets_lookup Function to access the API v2 Tweets Lookup (https://developer.twitter.com/en/docs/twitter-api/tweets/lookup/introduction) function. This is useful when looking to rehydrate tweets via their tweet IDs. An option to catch errors in the lookup process is added (see changes to utils for implementation) * Update tweets_lookup added total success overview at the end of the loop * Make runnable; pass all old unit tests * Add Tim König as a contributor * Exclude the `errors` part for now (It will be for another PR) * Add documentation and rename `tweets_lookup` to `hydrate_tweets` * Add tests for corner cases * Add basic test cases * Clear check messages / errors * Correct documentation of `hydrate_tweets` on `context_annotations` * Update hydrate_tweets.R minor adjustment to make sure the reporting doesn't break the function when bind_tweets == F ("Retrieved" and "Total Tweets" verbose messages rely on the new_rows object which is only made if tweets are bound) * Reduce some verbosity Co-authored-by: chainsawriot <[email protected]>
- Loading branch information
1 parent
352e359
commit 85f82a8
Showing
15 changed files
with
532 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,7 +21,11 @@ Authors@R: | |
family = "Rico", | ||
role = c("ctb"), | ||
email = "[email protected]", | ||
comment = c(ORCID = "0000-0002-6169-4523"))) | ||
comment = c(ORCID = "0000-0002-6169-4523")), | ||
person(given = "Tim", | ||
family = "König", | ||
role = c("ctb"), | ||
comment = c(ORCID = "0000-0002-2852-2690"))) | ||
Description: Package to query the Twitter Academic Research Product Track, | ||
providing access to full-archive search and other v2 API endpoints. Functions | ||
are written with academic research in mind. They provide flexibility in how | ||
|
@@ -48,7 +52,8 @@ Imports: | |
tidyr, | ||
tidyselect, | ||
purrr, | ||
rlang | ||
rlang, | ||
utils | ||
Suggests: | ||
knitr, | ||
rmarkdown, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
#' Hydrate Tweets Based On Tweet IDs
#'
#' This function is helpful for hydrating Tweet IDs (i.e. getting the full
#' content of tweets from a list of Tweet IDs).
#' @inheritParams get_all_tweets
#' @param ids a character vector of Tweet IDs
#' @param context_annotations If `TRUE`, context_annotations will be fetched.
#' @return When `bind_tweets` is `TRUE`, the function returns a data frame.
#' The `data_path` (invisibly) if `bind_tweets` is `FALSE`.
#' @examples
#' \dontrun{
#' hydrate_tweets(c("1266876474440761346", "1266868259925737474", "1266867327079002121",
#'                  "1266866660713127936", "1266864490446012418", "1266860737244336129",
#'                  "1266859737615826944", "1266859455586676736", "1266858090143588352",
#'                  "1266857669157097473"))
#' }
#' @export
hydrate_tweets <- function(ids, bearer_token = get_bearer(), data_path = NULL,
                           context_annotations = FALSE,
                           bind_tweets = TRUE,
                           verbose = TRUE) {
  ## NOTE: catching erroneous IDs (`errors` argument) is planned for a later PR.
  ## A data_path is required when tweets are not bound, because the JSON
  ## responses have to be written somewhere. Scalar test, hence `&&`.
  if (is.null(data_path) && !bind_tweets) {
    stop("Argument (bind_tweets = FALSE) only valid when a data_path is specified.")
  }
  ## Fields and expansions requested from the v2 Tweets Lookup endpoint.
  params <- list(
    tweet.fields = "attachments,author_id,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,source,text,withheld",
    user.fields = "created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld",
    expansions = "author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id",
    place.fields = "contained_within,country,country_code,full_name,geo,id,name,place_type"
  )
  if (context_annotations) {
    ## Same field list as above, plus context_annotations.
    params[["tweet.fields"]] <- "attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,source,text,withheld"
  }
  ## Query the API in batches of up to 100 IDs (the lookup endpoint's limit).
  ## ceiling(0 / 100) is 0, so an empty `ids` simply skips the loop below.
  new_df <- data.frame()
  n_batches <- ceiling(length(ids) / 100)
  endpoint_url <- "https://api.twitter.com/2/tweets"
  for (i in seq_len(n_batches)) {
    batch <- ids[((i - 1) * 100 + 1):min(length(ids), (i * 100))]
    params[["ids"]] <- paste0(batch, collapse = ",")
    ## Get tweets for this batch
    .vcat(verbose, "Batch", i, "out of", n_batches, ": ids",
          utils::head(batch, n = 1), "to", utils::tail(batch, n = 1), "\n")
    new_rows <- get_tweets(params = params, endpoint_url = endpoint_url, n = Inf,
                           file = NULL, bearer_token = bearer_token,
                           export_query = FALSE, data_path = data_path,
                           bind_tweets = bind_tweets, verbose = FALSE)
    ## Progress reporting relies on `new_rows` being a data frame, which is
    ## only the case when bind_tweets is TRUE (otherwise rows go to data_path).
    if (bind_tweets) {
      .vcat(verbose, "Retrieved", nrow(new_rows), "out of", length(batch), "\n")
      if (nrow(new_rows) > 0) {
        new_df <- dplyr::bind_rows(new_df, new_rows) # add new rows
      }
      .vcat(verbose, "Total Tweets:", nrow(new_df), "\n")
    }
  }
  if (bind_tweets) {
    return(new_df)
  }
  return(invisible(data_path))
}
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
structure(list(url = "https://api.twitter.com/2/tweets?tweet.fields=attachments%2Cauthor_id%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Cpossibly_sensitive%2Creferenced_tweets%2Csource%2Ctext%2Cwithheld&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&expansions=author_id%2Centities.mentions.username%2Cgeo.place_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1266876474440761346", | ||
status_code = 200L, headers = structure(list(date = "Thu, 16 Dec 2021 16:37:21 UTC", | ||
server = "tsa_o", `api-version` = "2.32", `content-type` = "application/json; charset=utf-8", | ||
`cache-control` = "no-cache, no-store, max-age=0", `content-length` = "943", | ||
`x-access-level` = "read", `x-frame-options` = "SAMEORIGIN", | ||
`content-encoding` = "gzip", `x-xss-protection` = "0", | ||
`x-rate-limit-limit` = "300", `x-rate-limit-reset` = "1639673360", | ||
`content-disposition` = "attachment; filename=json.json", | ||
`x-content-type-options` = "nosniff", `x-rate-limit-remaining` = "295", | ||
`strict-transport-security` = "max-age=631138519", `x-response-time` = "235", | ||
`x-connection-hash` = "5496ad10d9eafbe645adefab538b1910ec789fbddb69cce5e9a87564645c0780"), class = c("insensitive", | ||
"list")), all_headers = list(list(status = 200L, version = "HTTP/2", | ||
headers = structure(list(date = "Thu, 16 Dec 2021 16:37:21 UTC", | ||
server = "tsa_o", `api-version` = "2.32", `content-type` = "application/json; charset=utf-8", | ||
`cache-control` = "no-cache, no-store, max-age=0", | ||
`content-length` = "943", `x-access-level` = "read", | ||
`x-frame-options` = "SAMEORIGIN", `content-encoding` = "gzip", | ||
`x-xss-protection` = "0", `x-rate-limit-limit` = "300", | ||
`x-rate-limit-reset` = "1639673360", `content-disposition` = "attachment; filename=json.json", | ||
`x-content-type-options` = "nosniff", `x-rate-limit-remaining` = "295", | ||
`strict-transport-security` = "max-age=631138519", | ||
`x-response-time` = "235", `x-connection-hash` = "5496ad10d9eafbe645adefab538b1910ec789fbddb69cce5e9a87564645c0780"), class = c("insensitive", | ||
"list")))), cookies = structure(list(domain = c(".twitter.com", | ||
".twitter.com", ".twitter.com", ".twitter.com"), flag = c(TRUE, | ||
TRUE, TRUE, TRUE), path = c("/", "/", "/", "/"), secure = c(TRUE, | ||
TRUE, TRUE, TRUE), expiration = structure(c(1702744284, 1702744284, | ||
1702744284, 1702744284), class = c("POSIXct", "POSIXt")), | ||
name = c("guest_id_marketing", "guest_id_ads", "personalization_id", | ||
"guest_id"), value = c("REDACTED", "REDACTED", "REDACTED", | ||
"REDACTED")), row.names = c(NA, -4L), class = "data.frame"), | ||
content = charToRaw("{\"data\":[{\"lang\":\"de\",\"conversation_id\":\"1266876474440761346\",\"author_id\":\"282135623\",\"text\":\"diese Petition mitzeichnen!!\\n. \\nKonsequente Ausrichtung eines zukünftigen Konjunkturpakets.\\nNur soziale und technische Innovationen, #Klimaschutz und gesetzlich verankerte Gemeinwohlorientierung machen Deutschland zukunftsfähig.\\n#Datteln4\\n#FridaysForFuture\\nhttps://t.co/eZ8j61yftj\",\"public_metrics\":{\"retweet_count\":2,\"reply_count\":0,\"like_count\":3,\"quote_count\":0},\"possibly_sensitive\":false,\"created_at\":\"2020-05-30T23:37:43.000Z\",\"entities\":{\"urls\":[{\"start\":257,\"end\":280,\"url\":\"https://t.co/eZ8j61yftj\",\"expanded_url\":\"https://epetitionen.bundestag.de/petitionen/_2020/_04/_21/Petition_110043.%24%24%24.a.u.html\",\"display_url\":\"epetitionen.bundestag.de/petitionen/_20…\"}],\"hashtags\":[{\"start\":133,\"end\":145,\"tag\":\"Klimaschutz\"},{\"start\":229,\"end\":238,\"tag\":\"Datteln4\"},{\"start\":239,\"end\":256,\"tag\":\"FridaysForFuture\"}]},\"source\":\"Twitter for Android\",\"id\":\"1266876474440761346\"}],\"includes\":{\"users\":[{\"pinned_tweet_id\":\"1469634511591944194\",\"public_metrics\":{\"followers_count\":3373,\"following_count\":3311,\"tweet_count\":52294,\"listed_count\":0},\"protected\":false,\"name\":\"W. Heinrich ✌️\",\"created_at\":\"2011-04-14T16:25:59.000Z\",\"username\":\"1xKlaudius\",\"profile_image_url\":\"https://pbs.twimg.com/profile_images/1336439514785574915/yo0Fgf8u_normal.jpg\",\"verified\":false,\"location\":\"Franken / Bayern\",\"id\":\"282135623\",\"url\":\"\",\"description\":\"Kritisch - politisch Bio.... LinksGrün versifft... das ist gut so!\\nFalls jemand fragt...? \\nIch bin Radelfahrer bei jeder Gelegenheit und\\nschon lange Autofrei.\"}]}}"), | ||
date = structure(1639672641, class = c("POSIXct", "POSIXt" | ||
), tzone = "GMT"), times = c(redirect = 0, namelookup = 2.8e-05, | ||
connect = 2.9e-05, pretransfer = 0.000109, starttransfer = 0.251372, | ||
total = 0.251693)), class = "response") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
## These tests are using cases where ids are all valid. | ||
|
||
test_that("Corner cases", {
  ## Calling without `ids` must raise an error.
  expect_error(capture_warnings(hydrate_tweets()))
  ## An empty ID vector is accepted without error...
  expect_error(capture_warnings(hydrate_tweets(c())), NA)
  ## ...and yields an empty data frame.
  capture_warnings(empty_res <- hydrate_tweets(c()))
  expect_equal(nrow(empty_res), 0)
  expect_equal(class(empty_res), "data.frame")
})
|
||
## require(httptest) | ||
## start_capturing(simplify = FALSE) | ||
## fff <- readRDS("../testdata/fff_de.RDS") | ||
## hydrate_tweets(fff, verbose = FALSE) | ||
## stop_capturing() | ||
|
||
with_mock_api({
  test_that("normal case: fff de", {
    skip_if(!dir.exists("api.twitter.com"))
    ## Recorded fixture: every ID in fff_de.RDS is valid, so one row per ID.
    tweet_ids <- readRDS("../testdata/fff_de.RDS")
    expect_error(hydrated <- hydrate_tweets(tweet_ids, verbose = FALSE), NA)
    expect_equal(nrow(hydrated), length(tweet_ids))
  })
  test_that("normal case: verbose", {
    skip_if(!dir.exists("api.twitter.com"))
    tweet_ids <- readRDS("../testdata/fff_de.RDS")
    ## verbose = FALSE must print nothing; verbose = TRUE must print progress.
    expect_silent(hydrated <- hydrate_tweets(tweet_ids, verbose = FALSE))
    expect_output(capture_warnings(hydrated <- hydrate_tweets(tweet_ids, verbose = TRUE)))
  })
  test_that("normal case: bind_tweets", {
    skip_if(!dir.exists("api.twitter.com"))
    tweet_ids <- readRDS("../testdata/fff_de.RDS")
    ## With a data_path, JSON files are written and can be re-bound later.
    out_dir <- academictwitteR:::.gen_random_dir()
    expect_silent(hydrated <- hydrate_tweets(tweet_ids, verbose = FALSE, data_path = out_dir))
    expect_true(length(list.files(out_dir, "json$")) > 0)
    expect_error(rebound <- bind_tweets(out_dir, verbose = FALSE), NA)
    expect_equal(length(tweet_ids), nrow(rebound))
    unlink(out_dir, recursive = TRUE)
    out_dir <- academictwitteR:::.gen_random_dir()
    expect_silent(capture_warnings(hydrated <- hydrate_tweets(tweet_ids, verbose = FALSE, data_path = out_dir, bind_tweets = FALSE)))
    ## error when data_path is null and bind_tweets is FALSE
    ## the same expected behavior to `get_all_tweets`
    expect_error(capture_warnings(hydrated <- hydrate_tweets(tweet_ids, data_path = NULL, bind_tweets = FALSE)))
  })
})
|
||
## require(httptest) | ||
## start_capturing(simplify = FALSE) | ||
## fff <- readRDS("../testdata/fff_de.RDS") | ||
## hydrate_tweets(fff, verbose = FALSE, context_annotations = TRUE) | ||
## stop_capturing() | ||
|
||
with_mock_api({
  ## Fix: test description previously misspelled as "context_anntations".
  test_that("normal case: context_annotations", {
    skip_if(!dir.exists("api.twitter.com"))
    fff <- readRDS("../testdata/fff_de.RDS")
    ## With context_annotations = TRUE the column must be present.
    ca1 <- hydrate_tweets(fff, verbose = FALSE, context_annotations = TRUE)
    expect_true("context_annotations" %in% colnames(ca1))
    expect_equal(nrow(ca1), length(fff))
    ## With context_annotations = FALSE (the default) it must be absent.
    ca0 <- hydrate_tweets(fff, verbose = FALSE, context_annotations = FALSE)
    expect_false("context_annotations" %in% colnames(ca0))
    expect_equal(nrow(ca0), length(fff))
  })
})
|
||
## require(httptest) | ||
## start_capturing(simplify = FALSE) | ||
## fff <- readRDS("../testdata/fff_de.RDS") | ||
## manyf <- c(fff, fff, fff) | ||
## for (i in c(1, 99, 100, 199, 200, 250)) { | ||
## hydrate_tweets(manyf[seq_len(i)], verbose = FALSE) | ||
## } | ||
## stop_capturing() | ||
|
||
with_mock_api({
  test_that("normal case: different sizes", {
    skip_if(!dir.exists("api.twitter.com"))
    ## Exercise batch sizes around the 100-ID boundary of the lookup endpoint.
    fff <- readRDS("../testdata/fff_de.RDS")
    many_ids <- c(fff, fff, fff)
    for (size in c(1, 99, 100, 199, 200, 250)) {
      expect_error(hydrated <- hydrate_tweets(many_ids[seq_len(size)], verbose = FALSE), NA)
      expect_equal(nrow(hydrated), size)
    }
  })
})