Skip to content

Commit

Permalink
Re-jig when Flickr photos are saved during fetching.
Browse files Browse the repository at this point in the history
Rather than fetching all the data and then saving the Photos, save them after
fetching a single page (500 photos).

An attempt to reduce the memory usage, but I don't think it achieved anything.

Ref #148
  • Loading branch information
philgyford committed Jul 13, 2016
1 parent a8bd6d1 commit d19aa9e
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 54 deletions.
63 changes: 25 additions & 38 deletions ditto/flickr/fetch/fetchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,6 @@ def __init__(self, account):
# Will be the FlickrAPI object for calling the Flickr API.
self.api = None

# Are the things we're fetching in (possibly) multiple pages?
self.is_paged = False

# Will be incremented if we're fetching multiple pages.
self.page_number = 1

Expand Down Expand Up @@ -96,30 +93,11 @@ def __init__(self, account):
self.return_value['messages'] = ['Account has no API credentials']

def fetch(self, **kwargs):
if self.is_paged:
self._fetch_pages(**kwargs)
else:
self._fetch_page(**kwargs)
self._fetch_pages(**kwargs)

if self._not_failed():
# OK so far; get extra data, if any, before saving.
try:
self._fetch_extra()
except FetchError as e:
self.return_value['success'] = False
self.return_value['messages'] = [
'Error when fetching extra data: %s' % e]

if self._not_failed():
# Still OK; save the data we've got.
try:
self._save_results()
self.return_value['success'] = True
self.return_value['fetched'] += self.results_count
except FetchError as e:
self.return_value['success'] = False
self.return_value['messages'] = [
'Error when saving data: %s' % e]
self.return_value['success'] = True
self.return_value['fetched'] = self.results_count

return self.return_value

Expand All @@ -141,6 +119,26 @@ def _fetch_page(self, **kwargs):
self.return_value['success'] = False
self.return_value['messages'] = [
'Error when calling Flickr API: %s' % e]
return

try:
self._fetch_extra()
except FetchError as e:
self.return_value['success'] = False
self.return_value['messages'] = [
'Error when fetching extra data: %s' % e]
return

try:
self._save_results()
# Clear for the next page:
self.results = []
except FetchError as e:
self.return_value['success'] = False
self.return_value['messages'] = ['Error when saving data: %s' % e]
return

return

def _not_failed(self):
"""Has everything gone smoothly so far? ie, no failure registered?"""
Expand Down Expand Up @@ -263,9 +261,6 @@ def __init__(self, *args, **kwargs):

super().__init__(*args, **kwargs)

# Photos come in pages.
self.is_paged = True

def _call_api(self):
"""
Should call self.api.a_function() and set self.results with the results.
Expand Down Expand Up @@ -365,7 +360,7 @@ def _save_results(self):
saver = PhotoSaver()
for photo in self.results:
p = saver.save_photo(photo)
self.results_count = len(self.results)
self.results_count += len(self.results)


class RecentPhotosFetcher(PhotosFetcher):
Expand Down Expand Up @@ -412,19 +407,12 @@ def _call_api(self):
# First time, set the total_pages there are to fetch.
self.total_pages = int(results['photos']['pages'])


# Add the list of photos' data from this page on to our total list:
self.results += results['photos']['photo']


class PhotosetsFetcher(Fetcher):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

# Photosets come in pages.
self.is_paged = True

def _call_api(self):
"""Fetch one page of results.
Expand All @@ -446,7 +434,6 @@ def _call_api(self):
# First time, set the total_pages there are to fetch.
self.total_pages = int(results['photosets']['pages'])


# Add the list of photosets' data from this page on to our total list:
self.results += results['photosets']['photoset']

Expand Down Expand Up @@ -513,5 +500,5 @@ def _save_results(self):
saver = PhotosetSaver()
for photoset in self.results:
p = saver.save_photoset(photoset)
self.results_count = len(self.results)
self.results_count += len(self.results)

12 changes: 8 additions & 4 deletions docs/services/flickr.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ By default this will only allow the fetching of fully public photos. To fetch al
$ python ditto/scripts/flickr_authorize.py
3. Follow the instructions. A new browser window should open for you to
authorize your Flickr account. You'll then get a code to paste into your
Terminal.
3. Follow the instructions. You should get a URL to paste into a web browser,
in order to authorize your Flickr account. You'll then get a code to paste
into your Terminal.

Finally, for each of those Accounts, note its ID from the Django admin, and run the following management command to fetch information about its associated Flickr ``User`` (replacing ``1`` with your ``Account``'s Django ID, if different):

Expand Down Expand Up @@ -218,12 +218,16 @@ Once the ``Account`` has been created in the Django admin, and its API credentia
Fetch Photos
============

Fetches data about Photos (including videos). This will fetch data for ALL Photos for ALL Accounts (for me it took about 75 minutes for 3,000 photos):
Fetches data about Photos (including videos). This will fetch data for ALL Photos for ALL Accounts:

.. code-block:: shell
$ ./manage.py fetch_flickr_photos --days=all
**NOTE 1:** This took about 75 minutes to fetch data for 3,000 photos on my MacBook.

**NOTE 2:** Trying to run the same thing on a 512MB Digital Ocean machine resulted in the process being killed after fetching about 1,500 photos. See `this bug <https://github.com/philgyford/django-ditto/issues/148>`_.

This will only fetch Photos uploaded in the past 3 days:

.. code-block:: shell
Expand Down
36 changes: 24 additions & 12 deletions tests/flickr/test_fetch_fetchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ def test_requires_valid_account_object(self):
result = Fetcher(account=None)

@patch.object(Fetcher, '_fetch_extra')
@patch.object(Fetcher, '_fetch_page')
def test_returns_false_when_extra_data_fetching_fails(self, fetch_page, fetch_extra):
@patch.object(Fetcher, '_call_api')
def test_returns_false_when_extra_data_fetching_fails(self, call_api, fetch_extra):
fetch_extra.side_effect = FetchError('Oh dear')
account = AccountFactory(user=UserFactory(username='terry'),
api_key='1234', api_secret='9876')
Expand Down Expand Up @@ -146,7 +146,7 @@ def test_failure_if_getinfo_fails(self):

@responses.activate
@patch.object(UserFetcher, '_fetch_and_save_avatar')
def test_makes_one_api_calls(self, fetch_avatar):
def test_makes_one_api_call(self, fetch_avatar):
"Should call people.getInfo"
self.add_response('people.getInfo')
result = UserFetcher(account=self.account).fetch(
Expand Down Expand Up @@ -379,7 +379,9 @@ def test_call_api_error(self):
self.fetcher._call_api()

@responses.activate
def test_fetches_multiple_pages(self):
@patch.object(PhotosFetcher, '_fetch_extra')
@patch.object(PhotoSaver, 'save_photo')
def test_fetches_multiple_pages(self, save_photo, fetch_extra):
"""If the response from the API says there's more than 1 page of
results _fetch_pages() should fetch them all."""
# Alter our default response fixture to set the number of pages to 3:
Expand All @@ -393,11 +395,11 @@ def test_fetches_multiple_pages(self):
self.add_response('people.getPhotos', body=body, querystring={'page': '3'})

with patch('time.sleep'):
self.fetcher._fetch_pages()
results = self.fetcher.fetch(days='all')

self.assertEqual(len(responses.calls), 3)
# Our fixture has 3 photos, so we should now have 9:
self.assertEqual(len(self.fetcher.results), 9)
self.assertEqual(results['fetched'], 9)

@patch.object(PhotosFetcher, '_fetch_pages')
def test_calls_fetch_pages(self, fetch_pages):
Expand Down Expand Up @@ -457,8 +459,7 @@ def test_call_api_error(self):
self.fetcher._call_api()

@responses.activate
@patch.object(PhotosetsFetcher, '_fetch_extra')
def test_fetches_multiple_pages(self, fetch_extra):
def test_fetches_multiple_pages(self):
"""If the response from the API says there's more than 1 page of
results _fetch_pages() should fetch them all."""
# Alter our default response fixture to set the number of pages to 3:
Expand All @@ -473,12 +474,23 @@ def test_fetches_multiple_pages(self, fetch_extra):
self.add_response(
'photosets.getList', body=body, querystring={'page': '3'})

# We need to mock all the calls to fetch the photosets' photos too:
body_2 = json.dumps(self.load_fixture('photosets.getPhotos'))
self.add_response('photosets.getPhotos', body=body_2,
querystring={'photoset_id':'72157665648859705'})
self.add_response('photosets.getPhotos', body=body_2,
querystring={'photoset_id':'72157662491524213'})
self.add_response('photosets.getPhotos', body=body_2,
querystring={'photoset_id':'72157645155015916'})

with patch('time.sleep'):
self.fetcher._fetch_pages()
results = self.fetcher.fetch()

self.assertEqual(len(responses.calls), 3)
# Our fixture has 3 photosets, so we should now have 9:
self.assertEqual(len(self.fetcher.results), 9)
# 3 Photosets x 3 pages each, plus one getPhotos per photoset:
self.assertEqual(len(responses.calls), 12)
# Our fixture has 3 photosets, and we get 3 pages for each
# photoset, so we should now have 9:
self.assertEqual(results['fetched'], 9)

@patch.object(PhotosetsFetcher, '_fetch_pages')
@patch.object(PhotosetsFetcher, '_fetch_extra')
Expand Down

0 comments on commit d19aa9e

Please sign in to comment.