Skip to content

Commit

Permalink
Make existing TweetIngester Version1TweetIngester
Browse files Browse the repository at this point in the history
So that we can keep it working, for those with older existing Twitter
Archive downloads, while adding a newer ingester for those with 2019+
downloads.

For #229
  • Loading branch information
philgyford committed Feb 12, 2022
1 parent 1eb5445 commit 81d3c0a
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 53 deletions.
57 changes: 33 additions & 24 deletions ditto/twitter/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,30 +62,10 @@ def ingest(self, directory):
}

def _load_data(self, directory):
"""Goes through all the *.js files in `directory` and puts the tweet
data inside into self.tweets_data.
No data is saved to the database until we've successfully loaded JSON
from all of the files.
Keyword arguments:
directory -- The directory to load the files from.
Raises:
FetchError -- If the directory is invalid, or there are no .js files,
or we can't load JSON from one of the files.
"""
try:
for file in os.listdir(directory):
if file.endswith(".js"):
filepath = "%s/%s" % (directory, file)
self._get_data_from_file(filepath)
self.file_count += 1
except OSError as e:
raise IngestError(e)

if self.file_count == 0:
raise IngestError("No .js files found in %s" % directory)
raise NotImplementedError(
"Child classes of TweetImporter must implement their own "
"_load_data() method."
)

def _get_data_from_file(self, filepath):
"""Looks in a file, parses its JSON, and adds a dict of data about
Expand Down Expand Up @@ -116,3 +96,32 @@ def _save_tweets(self):
for tweet in self.tweets_data:
TweetSaver().save_tweet(tweet, self.fetch_time)
self.tweet_count += 1


class Version1TweetIngester(TweetIngester):
    """Ingester for the older Twitter archive download format, in which the
    tweet data is held in a directory of *.js files.
    """

    def _load_data(self, directory):
        """Goes through all the *.js files in `directory` and passes each one
        to self._get_data_from_file(), counting them in self.file_count.

        No data is saved to the database until we've successfully loaded JSON
        from all of the files.

        Keyword arguments:
        directory -- The directory to load the files from.

        Raises:
        IngestError -- If the directory can't be read (any OSError while
                       listing or loading files), or there are no .js files
                       in it. _get_data_from_file() may raise it too if a
                       file's JSON can't be loaded -- confirm against that
                       method.
        """
        try:
            for file in os.listdir(directory):
                if file.endswith(".js"):
                    filepath = "%s/%s" % (directory, file)
                    self._get_data_from_file(filepath)
                    # Only count the file once it has loaded without error.
                    self.file_count += 1
        except OSError as e:
            # Chain explicitly so the original OSError is kept as the
            # direct cause of the IngestError.
            raise IngestError(e) from e

        if self.file_count == 0:
            raise IngestError("No .js files found in %s" % directory)
34 changes: 24 additions & 10 deletions ditto/twitter/management/commands/import_twitter_tweets.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from django.core.management.base import BaseCommand, CommandError

from ...ingest import TweetIngester
from ...ingest import Version1TweetIngester


class Command(BaseCommand):
Expand All @@ -24,23 +24,39 @@ def add_arguments(self, parser):
help="Path to the directory that is the archive",
)

parser.add_argument(
"--archive-version",
action="store",
default=None,
help="v1 or v2 (default). Which format of archives to import from.",
)

def handle(self, *args, **options):
# Location of the directory holding the tweet JSON files within the
# archive:
subpath = "/data/js/tweets"

ingester_class = None

if options["archive_version"]:
if options["archive_version"] == "v1":
ingester_class = Version1TweetIngester
else:
raise CommandError(
f"version should be v1 or v2, not '{options['archive_version']}"
)
if options["path"]:
if os.path.isdir(options["path"]):
tweets_dir = "%s%s" % (options["path"], subpath)
if os.path.isdir(tweets_dir):
result = TweetIngester().ingest(directory=tweets_dir)
result = ingester_class().ingest(directory=tweets_dir)
else:
raise CommandError(
"Expected to find a directory at '%s' containing JSON files"
% tweets_dir
f"Expected to find a directory at '{tweets_dir}' "
"containing JSON files"
)
else:
raise CommandError("Can't find a directory at '%s'" % options["path"])
raise CommandError(f"Can't find a directory at '{options['path']}'")
else:
raise CommandError(
(
Expand All @@ -55,11 +71,9 @@ def handle(self, *args, **options):
filenoun = "file" if result["files"] == 1 else "files"

self.stdout.write(
"Imported %s %s from %s %s"
% (result["tweets"], tweetnoun, result["files"], filenoun)
f"Imported {result['tweets']} {tweetnoun} from "
f"{result['files']} {filenoun}"
)
else:

self.stderr.write(
"Failed to import tweets: %s" % (result["messages"][0])
)
self.stderr.write(f"Failed to import tweets: {result['messages'][0]}")
File renamed without changes.
22 changes: 11 additions & 11 deletions tests/twitter/test_ingest_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
from django.test import TestCase

from ditto.twitter import factories
from ditto.twitter.ingest import IngestError, TweetIngester
from ditto.twitter.ingest import IngestError, Version1TweetIngester
from ditto.twitter.models import Tweet


class TweetIngesterTestCase(TestCase):
class Version1TweetIngesterTestCase(TestCase):

# A sample file of the format we'd get in a Twitter archive.
ingest_fixture = "tests/twitter/fixtures/ingest/2015_08.js"
ingest_fixture = "tests/twitter/fixtures/ingest/v1/2015_08.js"

def get_tweet_data(self):
"Returns the JSON tweet data, as text, from the fixture."
Expand All @@ -23,14 +23,14 @@ def get_tweet_data(self):
def test_raises_error_with_invalid_dir(self):
with patch("os.path.isdir", return_value=False):
with self.assertRaises(IngestError):
TweetIngester().ingest(directory="/bad/dir")
Version1TweetIngester().ingest(directory="/bad/dir")

def test_raises_error_with_empty_dir(self):
"If no .js files are found, raises IngestError"
with patch("os.path.isdir", return_value=True):
with patch("ditto.twitter.ingest.TweetIngester", file_count=0):
with patch("ditto.twitter.ingest.Version1TweetIngester", file_count=0):
with self.assertRaises(IngestError):
TweetIngester().ingest(directory="/bad/dir")
Version1TweetIngester().ingest(directory="/bad/dir")

# All the below have a similar structure to mock out file-related functions.
# Here's what's happening:
Expand All @@ -56,7 +56,7 @@ def test_raises_error_with_empty_dir(self):

# Ingest! This will save Tweets using our fixture data, and imagine it's
# loaded data from our fake files:
# result = TweetIngester().ingest(directory='/good/dir')
# result = Version1TweetIngester().ingest(directory='/good/dir')

def test_opens_all_files(self):
"All the .js files in the directory are opened."
Expand All @@ -71,7 +71,7 @@ def test_opens_all_files(self):
m = mock_open(read_data=file_content)
with patch("builtins.open", m):
m.return_value.readlines.return_value = file_content.splitlines()
ingester = TweetIngester()
ingester = Version1TweetIngester()
ingester.ingest(directory="/good/dir")
m.assert_has_calls(
[
Expand All @@ -91,7 +91,7 @@ def test_saves_all_tweets(self):
m = mock_open(read_data=file_content)
with patch("builtins.open", m):
m.return_value.readlines.return_value = file_content.splitlines()
TweetIngester().ingest(directory="/good/dir")
Version1TweetIngester().ingest(directory="/good/dir")
# We load three dummy files; our results have three tweets in each:
self.assertEqual(Tweet.objects.count(), 3)

Expand All @@ -103,7 +103,7 @@ def test_returns_correctly_on_success(self):
m = mock_open(read_data=file_content)
with patch("builtins.open", m):
m.return_value.readlines.return_value = file_content.splitlines()
result = TweetIngester().ingest(directory="/good/dir")
result = Version1TweetIngester().ingest(directory="/good/dir")
self.assertTrue(result["success"])
self.assertEqual(result["tweets"], 3)
self.assertEqual(result["files"], 1)
Expand All @@ -116,7 +116,7 @@ def test_returns_correctly_on_success_no_tweets(self):
m = mock_open(read_data=file_content)
with patch("builtins.open", m):
m.return_value.readlines.return_value = file_content.splitlines()
result = TweetIngester().ingest(directory="/good/dir")
result = Version1TweetIngester().ingest(directory="/good/dir")
self.assertFalse(result["success"])
self.assertEqual(result["tweets"], 0)
self.assertEqual(result["files"], 1)
Expand Down
40 changes: 32 additions & 8 deletions tests/twitter/test_management_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,10 +216,12 @@ def test_error_output(self):
self.assertIn("Could not fetch @philgyford: It broke", self.out_err.getvalue())


class ImportTweets(TestCase):
class ImportTweetsVersion1(TestCase):
"Only testing using archive-version=v1 argument"

def setUp(self):
self.patcher = patch(
"ditto.twitter.management.commands.import_twitter_tweets.TweetIngester.ingest" # noqa: E501
"ditto.twitter.management.commands.import_twitter_tweets.Version1TweetIngester.ingest" # noqa: E501
)
self.ingest_mock = self.patcher.start()
self.out = StringIO()
Expand All @@ -233,39 +235,60 @@ def test_fails_with_no_args(self):
with self.assertRaises(CommandError):
call_command("import_twitter_tweets")

def test_fails_with_invalid_version(self):
    "An archive_version other than v1 or v2 raises CommandError"
    command_options = {"path": "/right/path", "archive_version": "nope"}
    with self.assertRaises(CommandError):
        call_command("import_twitter_tweets", **command_options)

def test_fails_with_invalid_directory(self):
"Test fails with invalid directory"
with patch("os.path.isdir", return_value=False):
with self.assertRaises(CommandError):
call_command("import_twitter_tweets", path="/wrong/path")
call_command(
"import_twitter_tweets", path="/wrong/path", archive_version="v1"
)

def test_calls_ingest_method(self):
"Calls correct class and method"
with patch("os.path.isdir", return_value=True):
call_command("import_twitter_tweets", path="/right/path", stdout=self.out)
call_command(
"import_twitter_tweets",
path="/right/path",
archive_version="v1",
stdout=self.out,
)
self.ingest_mock.assert_called_once_with(
directory="/right/path/data/js/tweets"
)

def test_success_output(self):
"""Outputs the correct response if ingesting succeeds."""
"""Outputs the correct response if ingesting succeeds"""
self.ingest_mock.return_value = {"success": True, "tweets": 12345, "files": 21}
with patch("os.path.isdir", return_value=True):
call_command("import_twitter_tweets", path="/right/path", stdout=self.out)
call_command(
"import_twitter_tweets",
path="/right/path",
archive_version="v1",
stdout=self.out,
)
self.assertIn("Imported 12345 tweets from 21 files", self.out.getvalue())

def test_success_output_verbosity_0(self):
"""Outputs nothing if ingesting succeeds."""
"""Outputs nothing if ingesting succeeds"""
self.ingest_mock.return_value = {"success": True, "tweets": 12345, "files": 21}
with patch("os.path.isdir", return_value=True):
call_command(
"import_twitter_tweets",
path="/right/path",
archive_version="v1",
verbosity=0,
stdout=self.out,
)
self.assertEqual("", self.out.getvalue())

def test_error_output(self):
"""Outputs the correct error if ingesting fails."""
"""Outputs the correct error if ingesting fails"""
self.ingest_mock.return_value = {
"success": False,
"messages": ["Something went wrong"],
Expand All @@ -274,6 +297,7 @@ def test_error_output(self):
call_command(
"import_twitter_tweets",
path="/right/path",
archive_version="v1",
stdout=self.out,
stderr=self.out_err,
)
Expand Down

0 comments on commit 81d3c0a

Please sign in to comment.