Add importing of media files when importing Tweets from an archive

* The downloaded archive includes all the media files associated with a user's tweets, so we can import them relatively easily. * We import the MP4s Twitter uses to display animated GIFs and the image files for JPGs/PNGs. We don't import video files that were uploaded as such because we don't currently include those when fetching media files from the API, so this is to remain consistent. * When we fetch media files for animated GIFs, we fetch both the MP4 and a JPG of it. Although we have the path for both in the tweet data in the archive, only the MP4 is present in the `tweet_media` directory so we only import that. For #229
philgyford · Feb 14, 2022 · 92688fd · 92688fd
1 parent 203797a
commit 92688fd
Show file tree

Hide file tree

Showing 11 changed files with 436 additions and 36 deletions.
diff --git a/ditto/twitter/ingest.py b/ditto/twitter/ingest.py
@@ -1,8 +1,11 @@
-# coding: utf-8
 import json
 import os
+from urllib.parse import urlparse
+
+from django.core.files import File
 
 from .fetch.savers import TweetSaver
+from .models import Media
 from ..core.utils import datetime_now
 
 
@@ -37,6 +40,9 @@ def __init__(self):
         # How many tweets we found in all the files:
         self.tweet_count = 0
 
+        # How many media files we imported:
+        self.media_count = 0
+
         # Stores all the imported data from the files before saving.
         # So that we know we've got through all the files without JSON errors
         # etc before we begin touching the DB.
@@ -47,19 +53,23 @@ def ingest(self, directory):
 
         self._load_data(directory)
 
-        self._save_tweets()
+        self._save_tweets(directory)
+
+        self._save_media(directory)
 
         if self.tweet_count > 0:
             return {
                 "success": True,
                 "tweets": self.tweet_count,
                 "files": self.file_count,
+                "media": self.media_count,
             }
         else:
             return {
                 "success": False,
                 "tweets": 0,
                 "files": self.file_count,
+                "media": self.media_count,
                 "messages": ["No tweets were found"],
             }
 
@@ -78,7 +88,7 @@ def _load_data(self, directory):
             "_load_data() method."
         )
 
-    def _save_tweets(self):
+    def _save_tweets(self, directory):
         """Go through the list of dicts that is self.tweets_data and
         create/update each tweet in the DB.
         """
@@ -89,6 +99,12 @@ def _save_tweets(self):
             TweetSaver().save_tweet(tweet, self.fetch_time)
             self.tweet_count += 1
 
+    def _save_media(self, directory):
+        """No-op hook for saving media files.
+
+        This base version does nothing with `directory`; a child class that
+        is able to import media can override it.
+        """
+        return None
+
 
 class Version1TweetIngester(TweetIngester):
     """
@@ -153,6 +169,9 @@ class Version2TweetIngester(TweetIngester):
     was introduced sometime between January and May of 2019.
 
     It contains two directories - assets and data - and a "Your archive.html" file.
+
+    This not only saves the Tweet objects but also imports media from the
+    tweet_media directory, saving it as Media files.
     """
 
     def __init__(self):
@@ -171,11 +190,11 @@ def _load_data(self, directory):
 
         self.user_data = self._construct_user_data(directory)
 
-        self.tweets_data = self._get_json_from_file(os.path.join(directory, "tweet.js"))
+        self.tweets_data = self._get_json_from_file(directory, "tweet.js")
 
         self.file_count = 1
 
-    def _save_tweets(self):
+    def _save_tweets(self, directory):
         """
         Save the tweets with our constructed user data.
         """
@@ -188,18 +207,71 @@ def _save_tweets(self):
             TweetSaver().save_tweet(tweet["tweet"], self.fetch_time, self.user_data)
             self.tweet_count += 1
 
+    def _save_media(self, directory):
+        """
+        Save any animated gif's mp4 or an image's file for the saved tweets.
+
+        Goes through the media items listed in each loaded tweet's
+        extended_entities. If a matching Media object exists and has no file
+        saved yet, the corresponding file from the tweet_media directory
+        within `directory` is attached to it, and self.media_count is
+        incremented.
+
+        We don't save video files - only image files for photos, and mp4s for
+        animated GIFs.
+
+        directory -- Path of the directory that contains tweet_media.
+
+        Raises OSError (e.g. FileNotFoundError) if an expected media file
+        can't be opened.
+        """
+        media_dir = os.path.join(directory, "tweet_media")
+
+        for t in self.tweets_data:
+            tweet = t["tweet"]
+
+            if not (
+                "extended_entities" in tweet and "media" in tweet["extended_entities"]
+            ):
+                continue
+
+            for item in tweet["extended_entities"]["media"]:
+                try:
+                    media_obj = Media.objects.get(twitter_id=int(item["id"]))
+                except Media.DoesNotExist:
+                    continue
+
+                if media_obj.has_file:
+                    # Only want to do this if we don't already have a file.
+                    continue
+
+                # Choose the URL and destination field afresh for every item,
+                # so that nothing carries over from a previous iteration and
+                # unsupported types (like video) are skipped explicitly.
+                if media_obj.media_type == "animated_gif":
+                    # When we fetch GIFs we also fetch an image file for
+                    # them. But their images aren't included in the
+                    # downloaded archive so we'll make do without here.
+                    url = media_obj.mp4_url
+                    file_field = media_obj.mp4_file
+                elif media_obj.media_type == "photo":
+                    url = media_obj.image_url
+                    file_field = media_obj.image_file
+                else:
+                    continue
+
+                if not url:
+                    continue
+
+                # Work out name of file in the tweet_media directory:
+                filename = os.path.basename(urlparse(url).path)
+                local_filename = f"{tweet['id_str']}-{filename}"
+                filepath = os.path.join(media_dir, local_filename)
+
+                # Close the source file as soon as it has been saved, rather
+                # than leaving one open handle per imported media item.
+                with open(filepath, "rb") as source:
+                    file_field.save(filename, File(source))
+
+                self.media_count += 1
+
+
     def _construct_user_data(self, directory):
         """
         Make a single dict of data about a user like we'd get from the API.
         This data is in several separate files in the download so we need to
         piece it together from those.
         """
 
-        account_data = self._get_json_from_file(os.path.join(directory, "account.js"))
+        account_data = self._get_json_from_file(directory, "account.js")
 
-        profile_data = self._get_json_from_file(os.path.join(directory, "profile.js"))
+        profile_data = self._get_json_from_file(directory, "profile.js")
 
-        verified_data = self._get_json_from_file(os.path.join(directory, "verified.js"))
+        verified_data = self._get_json_from_file(directory, "verified.js")
 
         try:
             user_data = {
@@ -223,7 +295,8 @@ def _construct_user_data(self, directory):
 
         return user_data
 
-    def _get_json_from_file(self, filepath):
+    def _get_json_from_file(self, directory, filepath):
+        filepath = os.path.join(directory, filepath)
         try:
             f = open(filepath)
         except OSError as e:

diff --git a/ditto/twitter/management/commands/import_twitter_tweets.py b/ditto/twitter/management/commands/import_twitter_tweets.py
@@ -37,18 +37,18 @@ def handle(self, *args, **options):
 
         ingester_class = None
 
+        # For v2, the default:
+        # Where the JS files are:
+        subpath = "/data"
+        ingester_class = Version2TweetIngester
+
         if options["archive_version"]:
             if options["archive_version"] == "v1":
                 # Where the JS files are:
                 subpath = "/data/js/tweets"
                 ingester_class = Version1TweetIngester
 
-            elif options["archive_version"] in ("v2", None):
-                # Where the JS files are:
-                subpath = "/data"
-                ingester_class = Version2TweetIngester
-
-            else:
+            elif options["archive_version"] != "v2":
                 raise CommandError(
                     f"version should be v1 or v2, not '{options['archive_version']}"
                 )
@@ -68,19 +68,21 @@ def handle(self, *args, **options):
         else:
             raise CommandError(
                 (
-                    "Specify the location of the archive, "
-                    "e.g. --path=/Path/To/1234567890_abcdefg12345"
+                    "Specify the location of the archive directory, "
+                    "e.g. --path=/path/to/twitter-2022-01-31-abcdef123456"
                 )
             )
 
         if options.get("verbosity", 1) > 0:
             if result["success"]:
                 tweetnoun = "tweet" if result["tweets"] == 1 else "tweets"
                 filenoun = "file" if result["files"] == 1 else "files"
+                mediafilenoun = "file" if result["media"] == 1 else "files"
 
                 self.stdout.write(
                     f"Imported {result['tweets']} {tweetnoun} from "
-                    f"{result['files']} {filenoun}"
+                    f"{result['files']} {filenoun}, "
+                    f"and {result['media']} media {mediafilenoun}"
                 )
             else:
 

diff --git a/ditto/twitter/models.py b/ditto/twitter/models.py
@@ -235,6 +235,14 @@ class Meta:
         verbose_name = "Media item"
         verbose_name_plural = "Media items"
 
+    @property
+    def has_file(self):
+        "True if either image_file or mp4_file has a name set, else False."
+        return bool(self.image_file.name or self.mp4_file.name)
+
     @property
     def thumbnail_w(self):
         "Because we usually actually want 150, not whatever thumb_w is."

diff --git a/tests/twitter/fixtures/ingest/v2_with_media/account.js b/tests/twitter/fixtures/ingest/v2_with_media/account.js
@@ -0,0 +1,12 @@
+window.YTD.account.part0 = [
+  {
+    "account" : {
+      "email" : "[email protected]",
+      "createdVia" : "web",
+      "username" : "philgyford",
+      "accountId" : "12552",
+      "createdAt" : "2006-11-15T16:55:59.000Z",
+      "accountDisplayName" : "Phil Gyford"
+    }
+  }
+]
diff --git a/tests/twitter/fixtures/ingest/v2_with_media/profile.js b/tests/twitter/fixtures/ingest/v2_with_media/profile.js
@@ -0,0 +1,13 @@
+window.YTD.profile.part0 = [
+  {
+    "profile" : {
+      "description" : {
+        "bio" : "Creator of the Bishop of Manchester’s favourite meme // Also @samuelpepys and @todaysguardian",
+        "website" : "https://t.co/FsYzXrATit",
+        "location" : "Herefordshire, UK"
+      },
+      "avatarMediaUrl" : "https://pbs.twimg.com/profile_images/1167616130/james_200208_300x300.jpg",
+      "headerMediaUrl" : "https://pbs.twimg.com/profile_banners/12552/1603038696"
+    }
+  }
+]