diff --git a/ditto/twitter/ingest.py b/ditto/twitter/ingest.py index f266aef..41a68e1 100644 --- a/ditto/twitter/ingest.py +++ b/ditto/twitter/ingest.py @@ -1,8 +1,11 @@ -# coding: utf-8 import json import os +from urllib.parse import urlparse + +from django.core.files import File from .fetch.savers import TweetSaver +from .models import Media from ..core.utils import datetime_now @@ -37,6 +40,9 @@ def __init__(self): # How mnay tweets we found in all the files: self.tweet_count = 0 + # How many media files we imported: + self.media_count = 0 + # Stores all the imported data from the files before saving. # So that we know we've got through all the files within JSON errors # etc before we begin touching the DB. @@ -47,19 +53,23 @@ def ingest(self, directory): self._load_data(directory) - self._save_tweets() + self._save_tweets(directory) + + self._save_media(directory) if self.tweet_count > 0: return { "success": True, "tweets": self.tweet_count, "files": self.file_count, + "media": self.media_count, } else: return { "success": False, "tweets": 0, "files": self.file_count, + "media": self.media_count, "messages": ["No tweets were found"], } @@ -78,7 +88,7 @@ def _load_data(self, directory): "_load_data() method." ) - def _save_tweets(self): + def _save_tweets(self, directory): """Go through the list of dicts that is self.tweets_data and create/update each tweet in the DB. """ @@ -89,6 +99,12 @@ def _save_tweets(self): TweetSaver().save_tweet(tweet, self.fetch_time) self.tweet_count += 1 + def _save_media(self, directory): + """Save media files. + Not doing anything by default. + """ + pass + class Version1TweetIngester(TweetIngester): """ @@ -153,6 +169,9 @@ class Version2TweetIngester(TweetIngester): was introduced sometime between January and May of 2019. It contains two directories - assets and data - and a "Your archive.html" file. + + This not only saves the Tweet objects but also imports media from the + tweet_media directory, saving it as Media files . """ def __init__(self): @@ -171,11 +190,11 @@ def _load_data(self, directory): self.user_data = self._construct_user_data(directory) - self.tweets_data = self._get_json_from_file(os.path.join(directory, "tweet.js")) + self.tweets_data = self._get_json_from_file(directory, "tweet.js") self.file_count = 1 - def _save_tweets(self): + def _save_tweets(self, directory): """ Save the tweets with our constructed user data. """ @@ -188,6 +207,59 @@ def _save_tweets(self): TweetSaver().save_tweet(tweet["tweet"], self.fetch_time, self.user_data) self.tweet_count += 1 + def _save_media(self, directory): + """ + Save any animated gif's mp4 or an image's file for the saved tweets. + """ + + for t in self.tweets_data: + tweet = t["tweet"] + + if "extended_entities" in tweet and "media" in tweet["extended_entities"]: + for item in tweet["extended_entities"]["media"]: + try: + media_obj = Media.objects.get(twitter_id=int(item["id"])) + except Media.DoesNotExist: + pass + else: + if ( + media_obj.media_type != "video" + and media_obj.has_file is False + ): + # We don't save video files - only image files, and mp4s for # GIFs - and only want to do this if we don't already have a + # file. + + if ( + media_obj.media_type == "animated_gif" + and media_obj.mp4_url + ): + url = media_obj.mp4_url + elif ( + media_obj.media_type == "photo" and media_obj.image_url + ): + url = media_obj.image_url + + if url: + # Work out name of file in the tweet_media directory: + parsed_url = urlparse(url) + filename = os.path.basename(parsed_url.path) + local_filename = f"{tweet['id_str']}-{filename}" + filepath = os.path.join( + directory, "tweet_media", local_filename + ) + + django_file = File(open(filepath, "rb")) + + if media_obj.media_type == "animated_gif": + # When we fetch GIFs we also fetch an image file for + # them. But their images aren't included in the + # downloaded archive so we'll make do without here. + media_obj.mp4_file.save(filename, django_file) + self.media_count += 1 + elif media_obj.media_type == "photo": + media_obj.image_file.save(filename, django_file) + self.media_count += 1 + def _construct_user_data(self, directory): """ Make a single dict of data about a user like we'd get from the API. @@ -195,11 +267,11 @@ def _construct_user_data(self, directory): piece it together from those. """ - account_data = self._get_json_from_file(os.path.join(directory, "account.js")) + account_data = self._get_json_from_file(directory, "account.js") - profile_data = self._get_json_from_file(os.path.join(directory, "profile.js")) + profile_data = self._get_json_from_file(directory, "profile.js") - verified_data = self._get_json_from_file(os.path.join(directory, "verified.js")) + verified_data = self._get_json_from_file(directory, "verified.js") try: user_data = { @@ -223,7 +295,8 @@ def _construct_user_data(self, directory): return user_data - def _get_json_from_file(self, filepath): + def _get_json_from_file(self, directory, filepath): + filepath = os.path.join(directory, filepath) try: f = open(filepath) except OSError as e: diff --git a/ditto/twitter/management/commands/import_twitter_tweets.py b/ditto/twitter/management/commands/import_twitter_tweets.py index e70fa07..b8c35cd 100644 --- a/ditto/twitter/management/commands/import_twitter_tweets.py +++ b/ditto/twitter/management/commands/import_twitter_tweets.py @@ -37,18 +37,18 @@ def handle(self, *args, **options): ingester_class = None + # For v2, the default: + # Where the JS files are: + subpath = "/data" + ingester_class = Version2TweetIngester + if options["archive_version"]: if options["archive_version"] == "v1": # Where the JS files are: subpath = "/data/js/tweets" ingester_class = Version1TweetIngester - elif options["archive_version"] in ("v2", None): - # Where the JS files are: - subpath = "/data" - ingester_class = Version2TweetIngester - - else: + elif options["archive_version"] != "v2": raise CommandError( f"version should be v1 or v2, not '{options['archive_version']}" ) @@ -68,8 +68,8 @@ def handle(self, *args, **options): else: raise CommandError( ( - "Specify the location of the archive, " - "e.g. --path=/Path/To/1234567890_abcdefg12345" + "Specify the location of the archive directory, " + "e.g. --path=/path/to/twitter-2022-01-31-abcdef123456" ) ) @@ -77,10 +77,12 @@ def handle(self, *args, **options): if result["success"]: tweetnoun = "tweet" if result["tweets"] == 1 else "tweets" filenoun = "file" if result["files"] == 1 else "files" + mediafilenoun = "file" if result["media"] == 1 else "files" self.stdout.write( f"Imported {result['tweets']} {tweetnoun} from " - f"{result['files']} {filenoun}" + f"{result['files']} {filenoun}, " + f"and {result['media']} media {mediafilenoun}" ) else: diff --git a/ditto/twitter/models.py b/ditto/twitter/models.py index 3a9cf2f..a236fb1 100644 --- a/ditto/twitter/models.py +++ b/ditto/twitter/models.py @@ -235,6 +235,14 @@ class Meta: verbose_name = "Media item" verbose_name_plural = "Media items" + @property + def has_file(self): + "Do we have a file saved at all?" + if self.image_file.name or self.mp4_file.name: + return True + else: + return False + @property def thumbnail_w(self): "Because we usually actually want 150, not whatever thumb_w is." diff --git a/tests/twitter/fixtures/ingest/v2_with_media/account.js b/tests/twitter/fixtures/ingest/v2_with_media/account.js new file mode 100644 index 0000000..ed9567e --- /dev/null +++ b/tests/twitter/fixtures/ingest/v2_with_media/account.js @@ -0,0 +1,12 @@ +window.YTD.account.part0 = [ + { + "account" : { + "email" : "phil@gyford.com", + "createdVia" : "web", + "username" : "philgyford", + "accountId" : "12552", + "createdAt" : "2006-11-15T16:55:59.000Z", + "accountDisplayName" : "Phil Gyford" + } + } +] \ No newline at end of file diff --git a/tests/twitter/fixtures/ingest/v2_with_media/profile.js b/tests/twitter/fixtures/ingest/v2_with_media/profile.js new file mode 100644 index 0000000..c3f486c --- /dev/null +++ b/tests/twitter/fixtures/ingest/v2_with_media/profile.js @@ -0,0 +1,13 @@ +window.YTD.profile.part0 = [ + { + "profile" : { + "description" : { + "bio" : "Creator of the Bishop of Manchester’s favourite meme // Also @samuelpepys and @todaysguardian", + "website" : "https://t.co/FsYzXrATit", + "location" : "Herefordshire, UK" + }, + "avatarMediaUrl" : "https://pbs.twimg.com/profile_images/1167616130/james_200208_300x300.jpg", + "headerMediaUrl" : "https://pbs.twimg.com/profile_banners/12552/1603038696" + } + } +] \ No newline at end of file diff --git a/tests/twitter/fixtures/ingest/v2_with_media/tweet.js b/tests/twitter/fixtures/ingest/v2_with_media/tweet.js new file mode 100644 index 0000000..4209cb3 --- /dev/null +++ b/tests/twitter/fixtures/ingest/v2_with_media/tweet.js @@ -0,0 +1,243 @@ +window.YTD.tweet.part0 = [ + { + "tweet" : { + "retweeted" : false, + "source" : "Tweetbot for Mac", + "entities" : { + "user_mentions" : [ ], + "urls" : [ + { + "url" : "https://t.co/piSTKg40ct", + "expanded_url" : "http://Dropbox.com", + "display_url" : "Dropbox.com", + "indices" : [ + "109", + "132" + ] + } + ], + "symbols" : [ ], + "media" : [ + { + "expanded_url" : "https://twitter.com/philgyford/status/1247471193357275137/photo/1", + "indices" : [ + "237", + "260" + ], + "url" : "https://t.co/DcKCkNekxp", + "media_url" : "http://pbs.twimg.com/tweet_video_thumb/EU_oaKjWkAAj6Gv.jpg", + "id_str" : "1247471158011793408", + "id" : "1247471158011793408", + "media_url_https" : "https://pbs.twimg.com/tweet_video_thumb/EU_oaKjWkAAj6Gv.jpg", + "sizes" : { + "small" : { + "w" : "250", + "h" : "330", + "resize" : "fit" + }, + "thumb" : { + "w" : "150", + "h" : "150", + "resize" : "crop" + }, + "medium" : { + "w" : "250", + "h" : "330", + "resize" : "fit" + }, + "large" : { + "w" : "250", + "h" : "330", + "resize" : "fit" + } + }, + "type" : "photo", + "display_url" : "pic.twitter.com/DcKCkNekxp" + } + ], + "hashtags" : [ ] + }, + "display_text_range" : [ + "0", + "260" + ], + "favorite_count" : "1", + "id_str" : "1247471193357275137", + "truncated" : false, + "retweet_count" : "0", + "id" : "1247471193357275137", + "possibly_sensitive" : false, + "created_at" : "Tue Apr 07 10:28:04 +0000 2020", + "favorited" : false, + "full_text" : "The good thing is that once I've finished the laborious process of downloading 39 separate shared folders on https://t.co/piSTKg40ct, the day can only get better.\n\nThe weirdly-changing top-level navigation is the least confusing aspect. https://t.co/DcKCkNekxp", + "lang" : "en", + "extended_entities" : { + "media" : [ + { + "expanded_url" : "https://twitter.com/philgyford/status/1247471193357275137/photo/1", + "indices" : [ + "237", + "260" + ], + "url" : "https://t.co/DcKCkNekxp", + "media_url" : "http://pbs.twimg.com/tweet_video_thumb/EU_oaKjWkAAj6Gv.jpg", + "id_str" : "1247471158011793408", + "video_info" : { + "aspect_ratio" : [ + "25", + "33" + ], + "variants" : [ + { + "bitrate" : "0", + "content_type" : "video/mp4", + "url" : "https://video.twimg.com/tweet_video/EU_oaKjWkAAj6Gv.mp4" + } + ] + }, + "id" : "1247471158011793408", + "media_url_https" : "https://pbs.twimg.com/tweet_video_thumb/EU_oaKjWkAAj6Gv.jpg", + "sizes" : { + "small" : { + "w" : "250", + "h" : "330", + "resize" : "fit" + }, + "thumb" : { + "w" : "150", + "h" : "150", + "resize" : "crop" + }, + "medium" : { + "w" : "250", + "h" : "330", + "resize" : "fit" + }, + "large" : { + "w" : "250", + "h" : "330", + "resize" : "fit" + } + }, + "type" : "animated_gif", + "display_url" : "pic.twitter.com/DcKCkNekxp" + } + ] + } + } + }, + { + "tweet" : { + "retweeted" : false, + "source" : "Twitter for iPad", + "entities" : { + "user_mentions" : [ ], + "urls" : [ + { + "url" : "https://t.co/6hPhxAcr9n", + "expanded_url" : "https://twitter.com/peterme/status/1358811736464191488", + "display_url" : "twitter.com/peterme/status…", + "indices" : [ + "179", + "202" + ] + } + ], + "symbols" : [ ], + "media" : [ + { + "expanded_url" : "https://twitter.com/philgyford/status/1359135226161750021/photo/1", + "indices" : [ + "203", + "226" + ], + "url" : "https://t.co/0tWLxyETsw", + "media_url" : "http://pbs.twimg.com/media/EtyeQ0FWYAEQDqk.jpg", + "id_str" : "1359135199255224321", + "id" : "1359135199255224321", + "media_url_https" : "https://pbs.twimg.com/media/EtyeQ0FWYAEQDqk.jpg", + "sizes" : { + "large" : { + "w" : "248", + "h" : "256", + "resize" : "fit" + }, + "thumb" : { + "w" : "150", + "h" : "150", + "resize" : "crop" + }, + "medium" : { + "w" : "248", + "h" : "256", + "resize" : "fit" + }, + "small" : { + "w" : "248", + "h" : "256", + "resize" : "fit" + } + }, + "type" : "photo", + "display_url" : "pic.twitter.com/0tWLxyETsw" + } + ], + "hashtags" : [ ] + }, + "display_text_range" : [ + "0", + "226" + ], + "favorite_count" : "3", + "id_str" : "1359135226161750021", + "truncated" : false, + "retweet_count" : "0", + "id" : "1359135226161750021", + "possibly_sensitive" : false, + "created_at" : "Tue Feb 09 13:41:04 +0000 2021", + "favorited" : false, + "full_text" : "I don’t know why I’d reveal my age when mentioning this anyway, but I think it was QS Defender, at my friend Simon’s. He had a 16K RAM Pack, if you can imagine such extravagance. https://t.co/6hPhxAcr9n https://t.co/0tWLxyETsw", + "lang" : "en", + "extended_entities" : { + "media" : [ + { + "expanded_url" : "https://twitter.com/philgyford/status/1359135226161750021/photo/1", + "indices" : [ + "203", + "226" + ], + "url" : "https://t.co/0tWLxyETsw", + "media_url" : "http://pbs.twimg.com/media/EtyeQ0FWYAEQDqk.jpg", + "id_str" : "1359135199255224321", + "id" : "1359135199255224321", + "media_url_https" : "https://pbs.twimg.com/media/EtyeQ0FWYAEQDqk.jpg", + "sizes" : { + "large" : { + "w" : "248", + "h" : "256", + "resize" : "fit" + }, + "thumb" : { + "w" : "150", + "h" : "150", + "resize" : "crop" + }, + "medium" : { + "w" : "248", + "h" : "256", + "resize" : "fit" + }, + "small" : { + "w" : "248", + "h" : "256", + "resize" : "fit" + } + }, + "type" : "photo", + "display_url" : "pic.twitter.com/0tWLxyETsw" + } + ] + } + } + } +] \ No newline at end of file diff --git a/tests/twitter/fixtures/ingest/v2_with_media/tweet_media/1247471193357275137-EU_oaKjWkAAj6Gv.mp4 b/tests/twitter/fixtures/ingest/v2_with_media/tweet_media/1247471193357275137-EU_oaKjWkAAj6Gv.mp4 new file mode 100644 index 0000000..b9aa006 Binary files /dev/null and b/tests/twitter/fixtures/ingest/v2_with_media/tweet_media/1247471193357275137-EU_oaKjWkAAj6Gv.mp4 differ diff --git a/tests/twitter/fixtures/ingest/v2_with_media/tweet_media/1359135226161750021-EtyeQ0FWYAEQDqk.jpg b/tests/twitter/fixtures/ingest/v2_with_media/tweet_media/1359135226161750021-EtyeQ0FWYAEQDqk.jpg new file mode 100644 index 0000000..4393be5 Binary files /dev/null and b/tests/twitter/fixtures/ingest/v2_with_media/tweet_media/1359135226161750021-EtyeQ0FWYAEQDqk.jpg differ diff --git a/tests/twitter/fixtures/ingest/v2_with_media/verified.js b/tests/twitter/fixtures/ingest/v2_with_media/verified.js new file mode 100644 index 0000000..2b0d9bd --- /dev/null +++ b/tests/twitter/fixtures/ingest/v2_with_media/verified.js @@ -0,0 +1,8 @@ +window.YTD.verified.part0 = [ + { + "verified" : { + "accountId" : "12552", + "verified" : false + } + } +] \ No newline at end of file diff --git a/tests/twitter/test_ingest_v2.py b/tests/twitter/test_ingest_v2.py index ce6fc28..7657ffa 100644 --- a/tests/twitter/test_ingest_v2.py +++ b/tests/twitter/test_ingest_v2.py @@ -9,20 +9,20 @@ from ditto.twitter.models import Tweet, User -# e.g. /path/to/django-ditto/tests/twitter/fixtures/ingest/v2/ +# e.g. /path/to/django-ditto/tests/twitter/fixtures/ingest FIXTURES_DIR = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "fixtures", "ingest", "v2" -) -FIXTURES_DIR_NO_TWEETS = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "fixtures", "ingest", "v2_no_tweets" + os.path.dirname(os.path.abspath(__file__)), "fixtures", "ingest" ) +FIXTURES_DIR_WITH_TWEETS = os.path.join(FIXTURES_DIR, "v2") +FIXTURES_DIR_NO_TWEETS = os.path.join(FIXTURES_DIR, "v2_no_tweets") +FIXTURES_DIR_WITH_MEDIA = os.path.join(FIXTURES_DIR, "v2_with_media") class Version2TweetIngesterTestCase(TestCase): def test_saves_all_tweets(self): "Saves the tweets to the DB" ingester = Version2TweetIngester() - ingester.ingest(directory=FIXTURES_DIR) + ingester.ingest(directory=FIXTURES_DIR_WITH_TWEETS) self.assertEqual(Tweet.objects.count(), 2) def test_associates_tweets_with_user(self): @@ -30,7 +30,7 @@ def test_associates_tweets_with_user(self): We should check, given it's all got from separate files. """ ingester = Version2TweetIngester() - ingester.ingest(directory=FIXTURES_DIR) + ingester.ingest(directory=FIXTURES_DIR_WITH_TWEETS) tweet = Tweet.objects.first() user = User.objects.first() @@ -40,7 +40,7 @@ def test_associates_tweets_with_user(self): def test_saves_user_data(self): "Saves data about the user in each tweet's json" ingester = Version2TweetIngester() - ingester.ingest(directory=FIXTURES_DIR) + ingester.ingest(directory=FIXTURES_DIR_WITH_TWEETS) user = User.objects.get(twitter_id=12552) raw = json.loads(user.raw) @@ -65,19 +65,41 @@ def test_saves_user_data(self): def test_saves_user_object(self): "Saves the user object correctly" ingester = Version2TweetIngester() - ingester.ingest(directory=FIXTURES_DIR) + ingester.ingest(directory=FIXTURES_DIR_WITH_TWEETS) user = User.objects.get(twitter_id=12552) self.assertEqual(user.screen_name, "philgyford") self.assertEqual(user.is_private, False) + def test_imports_media_files(self): + "It should create Media objects and import their files" + ingester = Version2TweetIngester() + ingester.ingest(directory=FIXTURES_DIR_WITH_MEDIA) + + # animated_gif + tweet = Tweet.objects.get(twitter_id=1247471193357275137) + media = tweet.media.first() + self.assertEqual(media.media_type, "animated_gif") + self.assertEqual(media.mp4_file.name, "twitter/media/j6/Gv/EU_oaKjWkAAj6Gv.mp4") + self.assertTrue(os.path.isfile(media.mp4_file.path)) + + # image + tweet = Tweet.objects.get(twitter_id=1359135226161750021) + media = tweet.media.first() + self.assertEqual(media.media_type, "photo") + self.assertEqual( + media.image_file.name, "twitter/media/QD/qk/EtyeQ0FWYAEQDqk.jpg" + ) + self.assertTrue(os.path.isfile(media.image_file.path)) + def test_returns_correctly_on_success(self): "After successfully importing tweets, returns correct data" ingester = Version2TweetIngester() - result = ingester.ingest(directory=FIXTURES_DIR) + result = ingester.ingest(directory=FIXTURES_DIR_WITH_TWEETS) self.assertTrue(result["success"]) self.assertEqual(result["tweets"], 2) self.assertEqual(result["files"], 1) + self.assertEqual(result["media"], 0) def test_returns_correctly_on_success_no_tweets(self): "No exceptions, but no tweets were imported; is correct data returned?" @@ -86,4 +108,13 @@ def test_returns_correctly_on_success_no_tweets(self): self.assertFalse(result["success"]) self.assertEqual(result["tweets"], 0) self.assertEqual(result["files"], 1) + self.assertEqual(result["media"], 0) self.assertEqual(result["messages"][0], "No tweets were found") + + def test_returns_correctly_on_success_with_media_files(self): + ingester = Version2TweetIngester() + result = ingester.ingest(directory=FIXTURES_DIR_WITH_MEDIA) + self.assertTrue(result["success"]) + self.assertEqual(result["tweets"], 2) + self.assertEqual(result["files"], 1) + self.assertEqual(result["media"], 2) diff --git a/tests/twitter/test_management_commands.py b/tests/twitter/test_management_commands.py index 3f1da60..952af42 100644 --- a/tests/twitter/test_management_commands.py +++ b/tests/twitter/test_management_commands.py @@ -266,8 +266,7 @@ def test_calls_ingest_method(self): class ImportTweetsVersion2(TestCase): - """Only testing using --archive-version=v2 argument - """ + """Only testing using --archive-version=v2 argument""" def setUp(self): self.patcher = patch( @@ -297,13 +296,16 @@ def test_calls_ingest_method(self): archive_version="v2", stdout=self.out, ) - self.ingest_mock.assert_called_once_with( - directory="/right/path/data" - ) + self.ingest_mock.assert_called_once_with(directory="/right/path/data") def test_success_output(self): """Outputs the correct response if ingesting succeeds""" - self.ingest_mock.return_value = {"success": True, "tweets": 12345, "files": 21} + self.ingest_mock.return_value = { + "success": True, + "tweets": 12345, + "files": 1, + "media": 345, + } with patch("os.path.isdir", return_value=True): call_command( "import_twitter_tweets", @@ -311,11 +313,19 @@ def test_success_output(self): archive_version="v2", stdout=self.out, ) - self.assertIn("Imported 12345 tweets from 21 files", self.out.getvalue()) + self.assertIn( + "Imported 12345 tweets from 1 file, and 345 media files", + self.out.getvalue(), + ) def test_success_output_verbosity_0(self): """Outputs nothing if ingesting succeeds""" - self.ingest_mock.return_value = {"success": True, "tweets": 12345, "files": 21} + self.ingest_mock.return_value = { + "success": True, + "tweets": 12345, + "files": 1, + "media": 345, + } with patch("os.path.isdir", return_value=True): call_command( "import_twitter_tweets",