Keep download state and do not attempt to redownload images and user handles #83

Open. Wants to merge 2 commits into main.

52 changes: 35 additions & 17 deletions parser.py
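
In outline, the change persists a small JSON state file across runs and consults it before fetching anything again. The following is a condensed, simplified sketch of that mechanism for orientation; it is not the exact code from the diff below, and the helper names (load_state, should_skip, record_result, save_state) are illustrative only.

import json

STATE_PATH = 'download_state.json'  # location used by this PR; review comments below suggest moving it under PathConfig

def load_state(path=STATE_PATH):
    """Load the persisted download state, or start fresh on the first run."""
    try:
        with open(path, 'r') as state_file:
            return json.load(state_file)
    except (IOError, json.decoder.JSONDecodeError):
        return {"media": {}, "users": {}}

def should_skip(media_url, state):
    """True if a previous run already fetched this URL successfully."""
    return state["media"].get(media_url, {}).get("success", False)

def record_result(media_url, local_path, success, bytes_downloaded, state):
    """Remember the outcome of one download attempt."""
    state["media"][media_url] = {
        "local": local_path,
        "success": success,
        "bytes_downloaded": bytes_downloaded,
    }

def save_state(state, path=STATE_PATH):
    """Persist the state so the next run can skip completed downloads."""
    with open(path, 'w') as state_file:
        json.dump(state, state_file, sort_keys=True, indent=4)
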
@@ -71,9 +71,10 @@ def get_twitter_api_guest_token(session, bearer_token):
return guest_token


def get_twitter_users(session, bearer_token, guest_token, user_ids):
def get_twitter_users(session, bearer_token, guest_token, user_ids, state):
"""Asks Twitter for all metadata associated with user_ids."""
users = {}
user_ids = [id for id in user_ids if id not in state]
while user_ids:
max_batch = 100
user_id_batch = user_ids[:max_batch]
@@ -89,10 +90,11 @@ def get_twitter_users(session, bearer_token, guest_token, user_ids):
response_json = json.loads(response.content)
for user in response_json:
users[user["id_str"]] = user
return users
state.update(users)
return state


def lookup_users(user_ids, users):
def lookup_users(user_ids, users, state):
"""Fill the users dictionary with data from Twitter"""
# Filter out any users already known
filtered_user_ids = [id for id in user_ids if id not in users]
@@ -110,7 +112,7 @@ def lookup_users(user_ids, users):
with requests.Session() as session:
timhutton (Owner) commented on Nov 24, 2022:

print(f'{len(filtered_user_ids)} users are unknown.')

This line is now misleading because many of these are in state and will get filtered out in get_twitter_users(). Can we do the filtering earlier, so that we can tell the users how many handles we need to download?

Author (Contributor) replied:

Since we're removing the duplication of users = {}, sure, we can do the filtering earlier... It also saves us from passing the dict pointer three layers deep...
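
A minimal sketch of the filter-earlier idea discussed above (illustrative only; it assumes state maps user ids to previously fetched user records, and filter_unknown_user_ids is a hypothetical helper, not part of this PR):

def filter_unknown_user_ids(user_ids, users, state):
    """Keep only ids that are in neither the in-memory users dict nor the persisted state,
    so the 'users are unknown' message reflects what actually needs downloading."""
    return [user_id for user_id in user_ids
            if user_id not in users and user_id not in state]

# Example:
user_ids = ['111', '222', '333']
users = {'111': 'already resolved in this run'}
state = {'222': {'screen_name': 'example_handle'}}  # resolved in a previous run
unknown = filter_unknown_user_ids(user_ids, users, state)
print(f'{len(unknown)} users are unknown.')  # prints: 1 users are unknown.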

bearer_token = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
guest_token = get_twitter_api_guest_token(session, bearer_token)
retrieved_users = get_twitter_users(session, bearer_token, guest_token, filtered_user_ids)
retrieved_users = get_twitter_users(session, bearer_token, guest_token, filtered_user_ids, state)
for user_id, user in retrieved_users.items():
users[user_id] = UserData(user_id, user["screen_name"])
except Exception as err:
@@ -362,7 +364,7 @@ def download_file_if_larger(url, filename, index, count, sleep_time):
return False, 0


def download_larger_media(media_sources, log_path):
def download_larger_media(media_sources, log_path, state):
"""Uses (filename, URL) tuples in media_sources to download files from remote storage.
Aborts downloads if the remote file is the same size or smaller than the existing local version.
Retries the failed downloads several times, with increasing pauses between each to avoid being blocked.
@@ -382,7 +384,13 @@ def download_larger_media(media_sources, log_path):
success_count = 0
retries = []
for index, (local_media_path, media_url) in enumerate(media_sources):
success, bytes_downloaded = download_file_if_larger(media_url, local_media_path, index + 1, number_of_files, sleep_time)
if state.get(media_url, {}).get('success'):
logging.info(f'{index + 1:3d}/{number_of_files:3d} {local_media_path}: SKIPPED. File already successfully fetched. Not attempting to download.')
success = state.get(media_url, {}).get('success', False)
bytes_downloaded = state.get(media_url, {}).get('bytes_downloaded', 0)
else:
success, bytes_downloaded = download_file_if_larger(media_url, local_media_path, index + 1, number_of_files, sleep_time)
state.update({media_url: {"local": local_media_path, "success": success, "bytes_downloaded": bytes_downloaded}})
if success:
success_count += 1
else:
@@ -444,7 +452,7 @@ def parse_tweets(input_filenames, username, users, html_template, archive_media_
return media_sources


def parse_followings(data_folder, users, user_id_URL_template, output_following_filename):
def parse_followings(data_folder, users, user_id_URL_template, output_following_filename, state):
"""Parse data_folder/following.js, write to output_following_filename.
Query Twitter API for the missing user handles, if the user agrees.
"""
@@ -454,7 +462,7 @@ def parse_followings(data_folder, users, user_id_URL_template, output_following_
for follow in following_json:
if 'following' in follow and 'accountId' in follow['following']:
following_ids.append(follow['following']['accountId'])
lookup_users(following_ids, users)
lookup_users(following_ids, users, state)
for id in following_ids:
handle = users[id].handle if id in users else '~unknown~handle~'
following.append(handle + ' ' + user_id_URL_template.format(id))
@@ -464,7 +472,7 @@ def parse_followings(data_folder, users, user_id_URL_template, output_following_
print(f"Wrote {len(following)} accounts to {output_following_filename}")


def parse_followers(data_folder, users, user_id_URL_template, output_followers_filename):
def parse_followers(data_folder, users, user_id_URL_template, output_followers_filename, state):
"""Parse data_folder/followers.js, write to output_followers_filename.
Query Twitter API for the missing user handles, if the user agrees.
"""
@@ -474,7 +482,7 @@ def parse_followers(data_folder, users, user_id_URL_template, output_followers_f
for follower in follower_json:
if 'follower' in follower and 'accountId' in follower['follower']:
follower_ids.append(follower['follower']['accountId'])
lookup_users(follower_ids, users)
lookup_users(follower_ids, users, state)
for id in follower_ids:
handle = users[id].handle if id in users else '~unknown~handle~'
followers.append(handle + ' ' + user_id_URL_template.format(id))
@@ -490,7 +498,7 @@ def chunks(lst: list, n: int):
yield lst[i:i + n]


def parse_direct_messages(data_folder, username, users, user_id_url_template, dm_output_filename_template):
def parse_direct_messages(data_folder, username, users, user_id_url_template, dm_output_filename_template, state):
"""Parse data_folder/direct-messages.js, write to one markdown file per conversation.
Query Twitter API for the missing user handles, if the user agrees.
"""
@@ -504,7 +512,7 @@ def parse_direct_messages(data_folder, username, users, user_id_url_template, dm
user1_id, user2_id = conversation_id.split('-')
dm_user_ids.add(user1_id)
dm_user_ids.add(user2_id)
lookup_users(list(dm_user_ids), users)
lookup_users(list(dm_user_ids), users, state)

# Parse the DMs and store the messages in a dict
conversations_messages = defaultdict(list)
@@ -598,6 +606,7 @@ def main():
data_folder = os.path.join(input_folder, 'data')
account_js_filename = os.path.join(data_folder, 'account.js')
log_path = os.path.join(output_media_folder_name, 'download_log.txt')
state_path = 'download_state.json'
Owner commented:

Move to PathConfig and use cache if we agree on that.

Collaborator commented:

FYI: the class PathConfig was merged into main today via PR #115, but it does not have a cache path yet.

I have just published draft PR #120, which should solve #99; it introduces PathConfig.dir_output_cache, which is currently used for only a single file.

Collaborator commented:

PR #120 is merged now, so you can use state_path = os.path.join(paths.dir_output_cache, 'download_state.json')
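
Under that suggestion, the path would be built from the cache directory instead of being hard-coded. A small sketch, assuming a PathConfig with a dir_output_cache attribute as described above (the class below is only a stand-in for the real PathConfig):

import os
from dataclasses import dataclass

@dataclass
class PathConfig:                 # stand-in; the real class was merged via PR #115/#120
    dir_output_cache: str

paths = PathConfig(dir_output_cache=os.path.join('output', 'cache'))
os.makedirs(paths.dir_output_cache, exist_ok=True)   # make sure the cache folder exists
state_path = os.path.join(paths.dir_output_cache, 'download_state.json')
print(state_path)                 # e.g. output/cache/download_state.json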

output_following_filename = 'following.txt'
output_followers_filename = 'followers.txt'
user_id_URL_template = 'https://twitter.com/i/user/{}'
@@ -623,6 +632,13 @@ def main():

users = {}
timhutton (Owner) commented on Nov 24, 2022:

state["users"] seems to duplicate users?

I like the idea of having a single cache mechanism for users, media, and unshortened URLs, so let's delete users. Unless I've missed something?

Author (Contributor) replied:

It does not explicitly duplicate users, but in effect it does.
Good point, removing that.
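
A minimal sketch of the single-cache idea (structure assumed, not part of this PR): one cache dict with sections for users, media, and unshortened URLs, and the in-memory users lookup rebuilt from the cached section instead of being kept as a separate, duplicated dict. UserData below is only a stand-in for the record type already used in parser.py.

from collections import namedtuple

UserData = namedtuple('UserData', ['id', 'handle'])   # stand-in for parser.py's UserData

def users_from_cache(cache):
    """Rebuild the in-memory users lookup from cached user records,
    so nothing has to be kept in sync with a parallel users dict."""
    return {user_id: UserData(user_id, user['screen_name'])
            for user_id, user in cache.get('users', {}).items()}

# Example with one cached record:
cache = {'users': {'12345': {'screen_name': 'example_handle'}}, 'media': {}, 'urls': {}}
print(users_from_cache(cache))   # {'12345': UserData(id='12345', handle='example_handle')}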


# Use our state store to prevent duplicate downloads
try:
with open(state_path, 'r') as state_file:
state = json.load(state_file)
Owner commented:

state is a very general word. Would cache be better?

Author (Contributor) replied:

When it started out, it was only about keeping download state, since the downloaded images were already on disk, so we were not really caching any data.

But now that we're actually handling user data, sure, we can relabel it to cache.

except (IOError, json.decoder.JSONDecodeError):
state = {"media": {}, "users": {}}

# Extract the username from data/account.js
if not os.path.isfile(account_js_filename):
print(f'Error: Failed to load {account_js_filename}. Start this script in the root folder of your Twitter archive.')
@@ -639,21 +655,23 @@ def main():

media_sources = parse_tweets(input_filenames, username, users, html_template, archive_media_folder,
output_media_folder_name, tweet_icon_path, output_html_filename)
parse_followings(data_folder, users, user_id_URL_template, output_following_filename)
parse_followers(data_folder, users, user_id_URL_template, output_followers_filename)
parse_direct_messages(data_folder, username, users, user_id_URL_template, dm_output_filename_template)
parse_followings(data_folder, users, user_id_URL_template, output_following_filename, state["users"])
parse_followers(data_folder, users, user_id_URL_template, output_followers_filename, state["users"])
parse_direct_messages(data_folder, username, users, user_id_URL_template, dm_output_filename_template, state["users"])

# Download larger images, if the user agrees
print(f"\nThe archive doesn't contain the original-size images. We can attempt to download them from twimg.com.")
print(f'Please be aware that this script may download a lot of data, which will cost you money if you are')
print(f'paying for bandwidth. Please be aware that the servers might block these requests if they are too')
print(f'frequent. This script may not work if your account is protected. You may want to set it to public')
print(f'before starting the download.')
user_input = input('\nOK to start downloading? [y/n]')
user_input = input('\nOK to start downloading? [y/N]')
if user_input.lower() in ('y', 'yes'):
download_larger_media(media_sources, log_path)
download_larger_media(media_sources, log_path, state["media"])
print('In case you set your account to public before initiating the download, do not forget to protect it again.')

with open(state_path, 'w') as state_file:
json.dump(state, state_file, sort_keys=True, indent=4)

if __name__ == "__main__":
main()