user-lookup-usernames.py
#!/usr/bin/env python3
# Script Information
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""
PURPOSE:
- Script to scrape Twitter users account information
with the Twitters V2 user_lookup_usernames endpoint.
INPUT:
- A file of usernames where each line contains one username.
OUTPUT:
- account_data--{todays-date}.json : a file where each line
represents one accounts information.
- account_errors--{todays-date}.json : a file which records any
errors received (one per line). You can then learn why certain ids
were not returned (private, suspended, etc.).
Author: Matthew R. DeVerna
"""
# Import packages
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
import argparse
import os
import json
from datetime import datetime as dt
import osometweet
from osometweet.utils import chunker
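# Note: osometweet is a third-party package. Assuming the standard PyPI
# distribution, it can be installed with `pip install osometweet`.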
# Create Functions.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def parse_cl_args():
    """Set CLI Arguments."""
    # Initiate the parser
    parser = argparse.ArgumentParser(
        description="Script to scrape Twitter users' account information."
    )
    # Add required arguments
    parser.add_argument(
        "-f", "--file",
        metavar='File',
        help="Full path to the file containing the "
        "USERNAMES you would like to scrape.",
        required=True
    )
    # Read parsed arguments from the command line into "args"
    args = parser.parse_args()
    # Assign the file name to a variable and return it
    usernames_file = args.file
    return usernames_file
def load_users(usernames_file):
    """
    Load all users, returning a list of lists, each at most 100
    usernames long.
    """
    with open(usernames_file, 'r') as f:
        users = [x.strip('\n') for x in f.readlines()]

    max_query_length = 100

    # This allows us to iterate through a long list of users
    # 100 users at a time (which is the maximum number of usernames
    # we can query Twitter for in one call).
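    # For example, 250 usernames would become three lists: two of 100
    # and one of 50 (assuming chunker keeps any remainder in a final,
    # shorter chunk).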
    chunked_user_list = chunker(
        seq=users,
        size=max_query_length
    )
    return chunked_user_list
def load_keys():
    """
    Load Twitter keys/tokens from the local environment.

    To set your environment variables, execute a terminal command like
    the one below for each of your tokens, matching the name on the
    left of the equal sign to the string passed to .get() below.

    Example:
        export 'TWITTER_API_KEY'='<your_twitter_api_key>'
    """
    access_token = os.environ.get("TWITTER_ACCESS_TOKEN")
    access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET")
    api_key = os.environ.get("TWITTER_API_KEY")
    api_key_secret = os.environ.get("TWITTER_API_KEY_SECRET")
    return access_token, access_token_secret, api_key, api_key_secret
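# This script needs all four variables set. A sketch of the required
# exports (the placeholder values are yours to fill in):
#
#     export TWITTER_ACCESS_TOKEN='<your_access_token>'
#     export TWITTER_ACCESS_TOKEN_SECRET='<your_access_token_secret>'
#     export TWITTER_API_KEY='<your_api_key>'
#     export TWITTER_API_KEY_SECRET='<your_api_key_secret>'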
def gather_data(
    access_token,
    access_token_secret,
    api_key,
    api_key_secret,
    chunked_user_list
):
    """
    Gather user info based on the chunked list of usernames with the
    provided OAuth1.0a credentials.
    """
    print("Gathering Data...")
    oauth1a = osometweet.OAuth1a(
        api_key=api_key,
        api_key_secret=api_key_secret,
        access_token=access_token,
        access_token_secret=access_token_secret
    )
    ot = osometweet.OsomeTweet(oauth1a)

    # Add all user_fields
    all_user_fields = osometweet.UserFields(everything=True)
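    # Note: everything=True requests every available user field (e.g.
    # name, description, public_metrics), not just the API defaults.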
    # Get today's date
    today = dt.strftime(dt.today(), "%Y-%m-%d_%H-%M")

    # Open two files. One for good data, the other for account errors.
    with open(f"account_data--{today}.json", 'w') as data_file,\
         open(f"account_errors--{today}.json", 'w') as error_file:

        # Iterate through the list of lists
        for one_hundred_users in chunked_user_list:
            response = ot.user_lookup_usernames(
                usernames=one_hundred_users,
                fields=all_user_fields
            )

            # Whereas the user_ids endpoint always returns both "data"
            # and "errors", the usernames endpoint includes these keys
            # only when the corresponding content is present.
            if "data" in response:
                data = response["data"]
            else:
                data = None
            if "errors" in response:
                errors = response["errors"]
            else:
                errors = None

            try:
                data_file.writelines(
                    f"{json.dumps(line)}\n" for line in data
                )
            except TypeError:
                print(
                    "No USER data found in this set of users, "
                    "skipping to the next set."
                )

            try:
                error_file.writelines(
                    f"{json.dumps(line)}\n" for line in errors
                )
            except TypeError:
                print(
                    "No problematic users found in this set of users, "
                    "skipping to the next set."
                )
# Execute the program
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == "__main__":
    usernames_file = parse_cl_args()
    chunked_user_list = load_users(usernames_file)
    access_token, access_token_secret, api_key, api_key_secret = load_keys()
    gather_data(
        access_token=access_token,
        access_token_secret=access_token_secret,
        api_key=api_key,
        api_key_secret=api_key_secret,
        chunked_user_list=chunked_user_list
    )
    print("Data pull complete.")