Skip to content

Commit

Permalink
Add importing of media files when importing Tweets from an archive
Browse files Browse the repository at this point in the history
* The downloaded archive includes all the media files associated
  with a user's tweets, so we can import them relatively easily.

* We import the MP4s Twitter uses to display animated GIFs and the
  image files for JPGs/PNGs. We don't import video files that were
  uploaded as such because we don't currently include those when
  fetching media files from the API, so this is to remain consistent.

* When we fetch media files for animated GIFs, we fetch both the MP4
  and a JPG of it. Although we have the path for both in the tweet data
  in the archive, only the MP4 is present in the `tweet_media` directory
  so we only import that.

For #229
  • Loading branch information
philgyford committed Feb 14, 2022
1 parent 203797a commit 92688fd
Show file tree
Hide file tree
Showing 11 changed files with 436 additions and 36 deletions.
91 changes: 82 additions & 9 deletions ditto/twitter/ingest.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# coding: utf-8
import json
import os
from urllib.parse import urlparse

from django.core.files import File

from .fetch.savers import TweetSaver
from .models import Media
from ..core.utils import datetime_now


Expand Down Expand Up @@ -37,6 +40,9 @@ def __init__(self):
# How many tweets we found in all the files:
self.tweet_count = 0

# How many media files we imported:
self.media_count = 0

# Stores all the imported data from the files before saving.
# So that we know we've got through all the files without JSON errors
# etc before we begin touching the DB.
Expand All @@ -47,19 +53,23 @@ def ingest(self, directory):

self._load_data(directory)

self._save_tweets()
self._save_tweets(directory)

self._save_media(directory)

if self.tweet_count > 0:
return {
"success": True,
"tweets": self.tweet_count,
"files": self.file_count,
"media": self.media_count,
}
else:
return {
"success": False,
"tweets": 0,
"files": self.file_count,
"media": self.media_count,
"messages": ["No tweets were found"],
}

Expand All @@ -78,7 +88,7 @@ def _load_data(self, directory):
"_load_data() method."
)

def _save_tweets(self):
def _save_tweets(self, directory):
"""Go through the list of dicts that is self.tweets_data and
create/update each tweet in the DB.
"""
Expand All @@ -89,6 +99,12 @@ def _save_tweets(self):
TweetSaver().save_tweet(tweet, self.fetch_time)
self.tweet_count += 1

def _save_media(self, directory):
"""Save media files.
Not doing anything by default.
"""
pass


class Version1TweetIngester(TweetIngester):
"""
Expand Down Expand Up @@ -153,6 +169,9 @@ class Version2TweetIngester(TweetIngester):
was introduced sometime between January and May of 2019.
It contains two directories - assets and data - and a "Your archive.html" file.
This not only saves the Tweet objects but also imports media from the
tweet_media directory, saving them as Media files.
"""

def __init__(self):
Expand All @@ -171,11 +190,11 @@ def _load_data(self, directory):

self.user_data = self._construct_user_data(directory)

self.tweets_data = self._get_json_from_file(os.path.join(directory, "tweet.js"))
self.tweets_data = self._get_json_from_file(directory, "tweet.js")

self.file_count = 1

def _save_tweets(self):
def _save_tweets(self, directory):
"""
Save the tweets with our constructed user data.
"""
Expand All @@ -188,18 +207,71 @@ def _save_tweets(self):
TweetSaver().save_tweet(tweet["tweet"], self.fetch_time, self.user_data)
self.tweet_count += 1

def _save_media(self, directory):
    """
    Save any animated GIF's MP4, or a photo's image file, for the saved tweets.

    For each media item listed in a tweet's extended_entities we look up
    the matching Media object (by twitter_id). If one exists, is not a
    video, and has no file yet, the corresponding file is read from the
    "tweet_media" directory inside `directory` and saved on the object:

    * animated_gif: the MP4 is saved to mp4_file.
    * photo: the image is saved to image_file.

    Media items with no Media object in the DB, or with no usable URL,
    are skipped. self.media_count is incremented for every file saved.

    directory -- Path to the directory containing "tweet_media".

    Raises OSError (e.g. FileNotFoundError) if an expected media file
    can't be opened.
    """

    for t in self.tweets_data:
        tweet = t["tweet"]

        media_items = tweet.get("extended_entities", {}).get("media", [])

        for item in media_items:
            try:
                media_obj = Media.objects.get(twitter_id=int(item["id"]))
            except Media.DoesNotExist:
                continue

            # We don't save video files - only image files, and mp4s for
            # GIFs - and only want to do this if we don't already have a
            # file.
            if media_obj.media_type == "video" or media_obj.has_file:
                continue

            # Choose the URL and destination field afresh for every item,
            # so that an item matching neither case is skipped rather than
            # reusing (or failing on) a value left over from a previous one.
            if media_obj.media_type == "animated_gif" and media_obj.mp4_url:
                # When we fetch GIFs we also fetch an image file for
                # them. But their images aren't included in the
                # downloaded archive so we'll make do without here.
                url = media_obj.mp4_url
                file_field = media_obj.mp4_file
            elif media_obj.media_type == "photo" and media_obj.image_url:
                url = media_obj.image_url
                file_field = media_obj.image_file
            else:
                continue

            # Work out name of file in the tweet_media directory:
            filename = os.path.basename(urlparse(url).path)
            local_filename = f"{tweet['id_str']}-{filename}"
            filepath = os.path.join(directory, "tweet_media", local_filename)

            # Use a context manager so the archive file is closed once
            # its contents have been saved.
            with open(filepath, "rb") as f:
                file_field.save(filename, File(f))

            self.media_count += 1

def _construct_user_data(self, directory):
"""
Make a single dict of data about a user like we'd get from the API.
This data is in several separate files in the download so we need to
piece it together from those.
"""

account_data = self._get_json_from_file(os.path.join(directory, "account.js"))
account_data = self._get_json_from_file(directory, "account.js")

profile_data = self._get_json_from_file(os.path.join(directory, "profile.js"))
profile_data = self._get_json_from_file(directory, "profile.js")

verified_data = self._get_json_from_file(os.path.join(directory, "verified.js"))
verified_data = self._get_json_from_file(directory, "verified.js")

try:
user_data = {
Expand All @@ -223,7 +295,8 @@ def _construct_user_data(self, directory):

return user_data

def _get_json_from_file(self, filepath):
def _get_json_from_file(self, directory, filepath):
filepath = os.path.join(directory, filepath)
try:
f = open(filepath)
except OSError as e:
Expand Down
20 changes: 11 additions & 9 deletions ditto/twitter/management/commands/import_twitter_tweets.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,18 +37,18 @@ def handle(self, *args, **options):

ingester_class = None

# For v2, the default:
# Where the JS files are:
subpath = "/data"
ingester_class = Version2TweetIngester

if options["archive_version"]:
if options["archive_version"] == "v1":
# Where the JS files are:
subpath = "/data/js/tweets"
ingester_class = Version1TweetIngester

elif options["archive_version"] in ("v2", None):
# Where the JS files are:
subpath = "/data"
ingester_class = Version2TweetIngester

else:
elif options["archive_version"] != "v2":
raise CommandError(
f"version should be v1 or v2, not '{options['archive_version']}"
)
Expand All @@ -68,19 +68,21 @@ def handle(self, *args, **options):
else:
raise CommandError(
(
"Specify the location of the archive, "
"e.g. --path=/Path/To/1234567890_abcdefg12345"
"Specify the location of the archive directory, "
"e.g. --path=/path/to/twitter-2022-01-31-abcdef123456"
)
)

if options.get("verbosity", 1) > 0:
if result["success"]:
tweetnoun = "tweet" if result["tweets"] == 1 else "tweets"
filenoun = "file" if result["files"] == 1 else "files"
mediafilenoun = "file" if result["media"] == 1 else "files"

self.stdout.write(
f"Imported {result['tweets']} {tweetnoun} from "
f"{result['files']} {filenoun}"
f"{result['files']} {filenoun}, "
f"and {result['media']} media {mediafilenoun}"
)
else:

Expand Down
8 changes: 8 additions & 0 deletions ditto/twitter/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,14 @@ class Meta:
verbose_name = "Media item"
verbose_name_plural = "Media items"

@property
def has_file(self):
    "True if either an image file or an MP4 file has been saved, else False."
    return bool(self.image_file.name or self.mp4_file.name)

@property
def thumbnail_w(self):
"Because we usually actually want 150, not whatever thumb_w is."
Expand Down
12 changes: 12 additions & 0 deletions tests/twitter/fixtures/ingest/v2_with_media/account.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
window.YTD.account.part0 = [
{
"account" : {
"email" : "[email protected]",
"createdVia" : "web",
"username" : "philgyford",
"accountId" : "12552",
"createdAt" : "2006-11-15T16:55:59.000Z",
"accountDisplayName" : "Phil Gyford"
}
}
]
13 changes: 13 additions & 0 deletions tests/twitter/fixtures/ingest/v2_with_media/profile.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
window.YTD.profile.part0 = [
{
"profile" : {
"description" : {
"bio" : "Creator of the Bishop of Manchester’s favourite meme // Also @samuelpepys and @todaysguardian",
"website" : "https://t.co/FsYzXrATit",
"location" : "Herefordshire, UK"
},
"avatarMediaUrl" : "https://pbs.twimg.com/profile_images/1167616130/james_200208_300x300.jpg",
"headerMediaUrl" : "https://pbs.twimg.com/profile_banners/12552/1603038696"
}
}
]
Loading

0 comments on commit 92688fd

Please sign in to comment.