From 81d3c0a555b0db721ffa3c4026cdf1da776ad032 Mon Sep 17 00:00:00 2001 From: Phil Gyford Date: Sat, 12 Feb 2022 11:49:51 +0000 Subject: [PATCH] Make existing TweetIngester Version1TweetIngester So that we can keep it working, for those with older existing Twitter Archive downloads, while adding a newer ingester for those with 2019+ downloads. For #229 --- ditto/twitter/ingest.py | 57 +++++++++++-------- .../commands/import_twitter_tweets.py | 34 +++++++---- .../fixtures/ingest/{ => v1}/2015_08.js | 0 tests/twitter/test_ingest_v1.py | 22 +++---- tests/twitter/test_management_commands.py | 40 ++++++++++--- 5 files changed, 100 insertions(+), 53 deletions(-) rename tests/twitter/fixtures/ingest/{ => v1}/2015_08.js (100%) diff --git a/ditto/twitter/ingest.py b/ditto/twitter/ingest.py index eb07d7bb..5074c81e 100644 --- a/ditto/twitter/ingest.py +++ b/ditto/twitter/ingest.py @@ -62,30 +62,10 @@ def ingest(self, directory): } def _load_data(self, directory): - """Goes through all the *.js files in `directory` and puts the tweet - data inside into self.tweets_data. - - No data is saved to the database until we've successfully loaded JSON - from all of the files. - - Keyword arguments: - directory -- The directory to load the files from. - - Raises: - FetchError -- If the directory is invalid, or there are no .js files, - or we can't load JSON from one of the files. - """ - try: - for file in os.listdir(directory): - if file.endswith(".js"): - filepath = "%s/%s" % (directory, file) - self._get_data_from_file(filepath) - self.file_count += 1 - except OSError as e: - raise IngestError(e) - - if self.file_count == 0: - raise IngestError("No .js files found in %s" % directory) + raise NotImplementedError( + "Child classes of TweetImporter must implement their own " + "_load_data() method." + ) def _get_data_from_file(self, filepath): """Looks in a file, parses its JSON, and adds a dict of data about @@ -116,3 +96,32 @@ def _save_tweets(self): for tweet in self.tweets_data: TweetSaver().save_tweet(tweet, self.fetch_time) self.tweet_count += 1 + + +class Version1TweetIngester(TweetIngester): + + def _load_data(self, directory): + """Goes through all the *.js files in `directory` and puts the tweet + data inside into self.tweets_data. + + No data is saved to the database until we've successfully loaded JSON + from all of the files. + + Keyword arguments: + directory -- The directory to load the files from. + + Raises: + FetchError -- If the directory is invalid, or there are no .js files, + or we can't load JSON from one of the files. + """ + try: + for file in os.listdir(directory): + if file.endswith(".js"): + filepath = "%s/%s" % (directory, file) + self._get_data_from_file(filepath) + self.file_count += 1 + except OSError as e: + raise IngestError(e) + + if self.file_count == 0: + raise IngestError("No .js files found in %s" % directory) \ No newline at end of file diff --git a/ditto/twitter/management/commands/import_twitter_tweets.py b/ditto/twitter/management/commands/import_twitter_tweets.py index e7c183bd..38e175d4 100644 --- a/ditto/twitter/management/commands/import_twitter_tweets.py +++ b/ditto/twitter/management/commands/import_twitter_tweets.py @@ -3,7 +3,7 @@ from django.core.management.base import BaseCommand, CommandError -from ...ingest import TweetIngester +from ...ingest import Version1TweetIngester class Command(BaseCommand): @@ -24,23 +24,39 @@ def add_arguments(self, parser): help="Path to the directory that is the archive", ) + parser.add_argument( + "--archive-version", + action="store", + default=None, + help="v1 or v2 (default). Which format of archives to import from.", + ) + def handle(self, *args, **options): # Location of the directory holding the tweet JSON files within the # archive: subpath = "/data/js/tweets" + ingester_class = None + + if options["archive_version"]: + if options["archive_version"] == "v1": + ingester_class = Version1TweetIngester + else: + raise CommandError( + f"version should be v1 or v2, not '{options['archive_version']}" + ) if options["path"]: if os.path.isdir(options["path"]): tweets_dir = "%s%s" % (options["path"], subpath) if os.path.isdir(tweets_dir): - result = TweetIngester().ingest(directory=tweets_dir) + result = ingester_class().ingest(directory=tweets_dir) else: raise CommandError( - "Expected to find a directory at '%s' containing JSON files" - % tweets_dir + f"Expected to find a directory at '{tweets_dir}' " + "containing JSON files" ) else: - raise CommandError("Can't find a directory at '%s'" % options["path"]) + raise CommandError(f"Can't find a directory at '{options['path']}'") else: raise CommandError( ( @@ -55,11 +71,9 @@ def handle(self, *args, **options): filenoun = "file" if result["files"] == 1 else "files" self.stdout.write( - "Imported %s %s from %s %s" - % (result["tweets"], tweetnoun, result["files"], filenoun) + f"Imported {result['tweets']} {tweetnoun} from " + f"{result['files']} {filenoun}" ) else: - self.stderr.write( - "Failed to import tweets: %s" % (result["messages"][0]) - ) + self.stderr.write(f"Failed to import tweets: {result['messages'][0]}") diff --git a/tests/twitter/fixtures/ingest/2015_08.js b/tests/twitter/fixtures/ingest/v1/2015_08.js similarity index 100% rename from tests/twitter/fixtures/ingest/2015_08.js rename to tests/twitter/fixtures/ingest/v1/2015_08.js diff --git a/tests/twitter/test_ingest_v1.py b/tests/twitter/test_ingest_v1.py index 2e31cb52..55d645f4 100644 --- a/tests/twitter/test_ingest_v1.py +++ b/tests/twitter/test_ingest_v1.py @@ -4,14 +4,14 @@ from django.test import TestCase from ditto.twitter import factories -from ditto.twitter.ingest import IngestError, TweetIngester +from ditto.twitter.ingest import IngestError, Version1TweetIngester from ditto.twitter.models import Tweet -class TweetIngesterTestCase(TestCase): +class Version1TweetIngesterTestCase(TestCase): # A sample file of the format we'd get in a Twitter archive. - ingest_fixture = "tests/twitter/fixtures/ingest/2015_08.js" + ingest_fixture = "tests/twitter/fixtures/ingest/v1/2015_08.js" def get_tweet_data(self): "Returns the JSON tweet data, as text, from the fixture." @@ -23,14 +23,14 @@ def get_tweet_data(self): def test_raises_error_with_invalid_dir(self): with patch("os.path.isdir", return_value=False): with self.assertRaises(IngestError): - TweetIngester().ingest(directory="/bad/dir") + Version1TweetIngester().ingest(directory="/bad/dir") def test_raises_error_with_empty_dir(self): "If no .js files are found, raises IngestError" with patch("os.path.isdir", return_value=True): - with patch("ditto.twitter.ingest.TweetIngester", file_count=0): + with patch("ditto.twitter.ingest.Version1TweetIngester", file_count=0): with self.assertRaises(IngestError): - TweetIngester().ingest(directory="/bad/dir") + Version1TweetIngester().ingest(directory="/bad/dir") # All the below have a similar structure to mock out file-related functions. # Here's what's happening: @@ -56,7 +56,7 @@ def test_raises_error_with_empty_dir(self): # Ingest! This will save Tweets using our fixture data, and imagine it's # loaded data from our fake files: - # result = TweetIngester().ingest(directory='/good/dir') + # result = Version1TweetIngester().ingest(directory='/good/dir') def test_opens_all_files(self): "All the .js files in the directory are opened." @@ -71,7 +71,7 @@ def test_opens_all_files(self): m = mock_open(read_data=file_content) with patch("builtins.open", m): m.return_value.readlines.return_value = file_content.splitlines() - ingester = TweetIngester() + ingester = Version1TweetIngester() ingester.ingest(directory="/good/dir") m.assert_has_calls( [ @@ -91,7 +91,7 @@ def test_saves_all_tweets(self): m = mock_open(read_data=file_content) with patch("builtins.open", m): m.return_value.readlines.return_value = file_content.splitlines() - TweetIngester().ingest(directory="/good/dir") + Version1TweetIngester().ingest(directory="/good/dir") # We load three dummy files; our results have three tweets in each: self.assertEqual(Tweet.objects.count(), 3) @@ -103,7 +103,7 @@ def test_returns_correctly_on_success(self): m = mock_open(read_data=file_content) with patch("builtins.open", m): m.return_value.readlines.return_value = file_content.splitlines() - result = TweetIngester().ingest(directory="/good/dir") + result = Version1TweetIngester().ingest(directory="/good/dir") self.assertTrue(result["success"]) self.assertEqual(result["tweets"], 3) self.assertEqual(result["files"], 1) @@ -116,7 +116,7 @@ def test_returns_correctly_on_success_no_tweets(self): m = mock_open(read_data=file_content) with patch("builtins.open", m): m.return_value.readlines.return_value = file_content.splitlines() - result = TweetIngester().ingest(directory="/good/dir") + result = Version1TweetIngester().ingest(directory="/good/dir") self.assertFalse(result["success"]) self.assertEqual(result["tweets"], 0) self.assertEqual(result["files"], 1) diff --git a/tests/twitter/test_management_commands.py b/tests/twitter/test_management_commands.py index 1391fa36..86a55065 100644 --- a/tests/twitter/test_management_commands.py +++ b/tests/twitter/test_management_commands.py @@ -216,10 +216,12 @@ def test_error_output(self): self.assertIn("Could not fetch @philgyford: It broke", self.out_err.getvalue()) -class ImportTweets(TestCase): +class ImportTweetsVersion1(TestCase): + "Only testing using archive-version=v1 argument" + def setUp(self): self.patcher = patch( - "ditto.twitter.management.commands.import_twitter_tweets.TweetIngester.ingest" # noqa: E501 + "ditto.twitter.management.commands.import_twitter_tweets.Version1TweetIngester.ingest" # noqa: E501 ) self.ingest_mock = self.patcher.start() self.out = StringIO() @@ -233,39 +235,60 @@ def test_fails_with_no_args(self): with self.assertRaises(CommandError): call_command("import_twitter_tweets") + def test_fails_with_invalid_version(self): + with self.assertRaises(CommandError): + call_command( + "import_twitter_tweets", path="/right/path", archive_version="nope" + ) + def test_fails_with_invalid_directory(self): + "Test fails with invalid directory" with patch("os.path.isdir", return_value=False): with self.assertRaises(CommandError): - call_command("import_twitter_tweets", path="/wrong/path") + call_command( + "import_twitter_tweets", path="/wrong/path", archive_version="v1" + ) def test_calls_ingest_method(self): + "Calls correct class and method" with patch("os.path.isdir", return_value=True): - call_command("import_twitter_tweets", path="/right/path", stdout=self.out) + call_command( + "import_twitter_tweets", + path="/right/path", + archive_version="v1", + stdout=self.out, + ) self.ingest_mock.assert_called_once_with( directory="/right/path/data/js/tweets" ) def test_success_output(self): - """Outputs the correct response if ingesting succeeds.""" + """Outputs the correct response if ingesting succeeds""" self.ingest_mock.return_value = {"success": True, "tweets": 12345, "files": 21} with patch("os.path.isdir", return_value=True): - call_command("import_twitter_tweets", path="/right/path", stdout=self.out) + call_command( + "import_twitter_tweets", + path="/right/path", + archive_version="v1", + stdout=self.out, + ) self.assertIn("Imported 12345 tweets from 21 files", self.out.getvalue()) def test_success_output_verbosity_0(self): - """Outputs nothing if ingesting succeeds.""" + """Outputs nothing if ingesting succeeds""" self.ingest_mock.return_value = {"success": True, "tweets": 12345, "files": 21} with patch("os.path.isdir", return_value=True): call_command( "import_twitter_tweets", path="/right/path", + archive_version="v1", verbosity=0, stdout=self.out, ) self.assertEqual("", self.out.getvalue()) def test_error_output(self): - """Outputs the correct error if ingesting fails.""" + """Outputs the correct error if ingesting fails""" self.ingest_mock.return_value = { "success": False, "messages": ["Something went wrong"], @@ -274,6 +297,7 @@ def test_error_output(self): call_command( "import_twitter_tweets", path="/right/path", + archive_version="v1", stdout=self.out, stderr=self.out_err, )