Skip to content

Commit

Permalink
Re-jig when Flickr photos are saved during fetching.
Browse files Browse the repository at this point in the history
Rather than fetching all the data and then saving the Photos, save them after
fetching a single page (500 photos).

An attempt to reduce the memory usage, but I don't think it achieved anything.

Ref #148
  • Loading branch information
philgyford committed Jul 13, 2016
1 parent a8bd6d1 commit d19aa9e
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 54 deletions.
63 changes: 25 additions & 38 deletions ditto/flickr/fetch/fetchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,6 @@ def __init__(self, account):
# Will be the FlickrAPI object for calling the Flickr API.
self.api = None

# Are the things we're fetching in (possibly) multiple pages?
self.is_paged = False

# Will be incremented if we're fetching multiple pages.
self.page_number = 1

Expand Down Expand Up @@ -96,30 +93,11 @@ def __init__(self, account):
self.return_value['messages'] = ['Account has no API credentials']

def fetch(self, **kwargs):
if self.is_paged:
self._fetch_pages(**kwargs)
else:
self._fetch_page(**kwargs)
self._fetch_pages(**kwargs)

if self._not_failed():
# OK so far; get extra data, if any, before saving.
try:
self._fetch_extra()
except FetchError as e:
self.return_value['success'] = False
self.return_value['messages'] = [
'Error when fetching extra data: %s' % e]

if self._not_failed():
# Still OK; save the data we've got.
try:
self._save_results()
self.return_value['success'] = True
self.return_value['fetched'] += self.results_count
except FetchError as e:
self.return_value['success'] = False
self.return_value['messages'] = [
'Error when saving data: %s' % e]
self.return_value['success'] = True
self.return_value['fetched'] = self.results_count

return self.return_value

Expand All @@ -141,6 +119,26 @@ def _fetch_page(self, **kwargs):
self.return_value['success'] = False
self.return_value['messages'] = [
'Error when calling Flickr API: %s' % e]
return

try:
self._fetch_extra()
except FetchError as e:
self.return_value['success'] = False
self.return_value['messages'] = [
'Error when fetching extra data: %s' % e]
return

try:
self._save_results()
# Clear for the next page:
self.results = []
except FetchError as e:
self.return_value['success'] = False
self.return_value['messages'] = ['Error when saving data: %s' % e]
return

return

def _not_failed(self):
"""Has everything gone smoothly so far? ie, no failure registered?"""
Expand Down Expand Up @@ -263,9 +261,6 @@ def __init__(self, *args, **kwargs):

super().__init__(*args, **kwargs)

# Photos come in pages.
self.is_paged = True

def _call_api(self):
"""
Should call self.api.a_function() and set self.results with the results.
Expand Down Expand Up @@ -365,7 +360,7 @@ def _save_results(self):
saver = PhotoSaver()
for photo in self.results:
p = saver.save_photo(photo)
self.results_count = len(self.results)
self.results_count += len(self.results)


class RecentPhotosFetcher(PhotosFetcher):
Expand Down Expand Up @@ -412,19 +407,12 @@ def _call_api(self):
# First time, set the total_pages there are to fetch.
self.total_pages = int(results['photos']['pages'])


# Add the list of photos' data from this page on to our total list:
self.results += results['photos']['photo']


class PhotosetsFetcher(Fetcher):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

# Photosets come in pages.
self.is_paged = True

def _call_api(self):
"""Fetch one page of results.
Expand All @@ -446,7 +434,6 @@ def _call_api(self):
# First time, set the total_pages there are to fetch.
self.total_pages = int(results['photosets']['pages'])


# Add the list of photosets' data from this page on to our total list:
self.results += results['photosets']['photoset']

Expand Down Expand Up @@ -513,5 +500,5 @@ def _save_results(self):
saver = PhotosetSaver()
for photoset in self.results:
p = saver.save_photoset(photoset)
self.results_count = len(self.results)
self.results_count += len(self.results)

12 changes: 8 additions & 4 deletions docs/services/flickr.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ By default this will only allow the fetching of fully public photos. To fetch al
$ python ditto/scripts/flickr_authorize.py
3. Follow the instructions. A new browser window should open for you to
authorize your Flickr account. You'll then get a code to paste into your
Terminal.
3. Follow the instructions. You should get a URL to paste into a web browser,
in order to authorize your Flickr account. You'll then get a code to paste
into your Terminal.

Finally, for each of those Accounts, note its ID from the Django admin, and run the following management command to fetch information about its associated Flickr ``User`` (replacing ``1`` with your ``Account``'s Django ID, if different):

Expand Down Expand Up @@ -218,12 +218,16 @@ Once the ``Account`` has been created in the Django admin, and its API credentia
Fetch Photos
============

Fetches data about Photos (including videos). This will fetch data for ALL Photos for ALL Accounts (for me it took about 75 minutes for 3,000 photos):
Fetches data about Photos (including videos). This will fetch data for ALL Photos for ALL Accounts:

.. code-block:: shell
$ ./manage.py fetch_flickr_photos --days=all
**NOTE 1:** This took about 75 minutes to fetch data for 3,000 photos on my MacBook.

**NOTE 2:** Trying to run the same thing on a 512MB Digital Ocean machine resulted in the process being killed after fetching about 1,500 photos. See `this bug <https://github.com/philgyford/django-ditto/issues/148>`_.

This will only fetch Photos uploaded in the past 3 days:

.. code-block:: shell
Expand Down
36 changes: 24 additions & 12 deletions tests/flickr/test_fetch_fetchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ def test_requires_valid_account_object(self):
result = Fetcher(account=None)

@patch.object(Fetcher, '_fetch_extra')
@patch.object(Fetcher, '_fetch_page')
def test_returns_false_when_extra_data_fetching_fails(self, fetch_page, fetch_extra):
@patch.object(Fetcher, '_call_api')
def test_returns_false_when_extra_data_fetching_fails(self, call_api, fetch_extra):
fetch_extra.side_effect = FetchError('Oh dear')
account = AccountFactory(user=UserFactory(username='terry'),
api_key='1234', api_secret='9876')
Expand Down Expand Up @@ -146,7 +146,7 @@ def test_failure_if_getinfo_fails(self):

@responses.activate
@patch.object(UserFetcher, '_fetch_and_save_avatar')
def test_makes_one_api_calls(self, fetch_avatar):
def test_makes_one_api_call(self, fetch_avatar):
"Should call people.getInfo"
self.add_response('people.getInfo')
result = UserFetcher(account=self.account).fetch(
Expand Down Expand Up @@ -379,7 +379,9 @@ def test_call_api_error(self):
self.fetcher._call_api()

@responses.activate
def test_fetches_multiple_pages(self):
@patch.object(PhotosFetcher, '_fetch_extra')
@patch.object(PhotoSaver, 'save_photo')
def test_fetches_multiple_pages(self, save_photo, fetch_extra):
"""If the response from the API says there's more than 1 page of
results _fetch_pages() should fetch them all."""
# Alter our default response fixture to set the number of pages to 3:
Expand All @@ -393,11 +395,11 @@ def test_fetches_multiple_pages(self):
self.add_response('people.getPhotos', body=body, querystring={'page': '3'})

with patch('time.sleep'):
self.fetcher._fetch_pages()
results = self.fetcher.fetch(days='all')

self.assertEqual(len(responses.calls), 3)
# Our fixture has 3 photos, so we should now have 9:
self.assertEqual(len(self.fetcher.results), 9)
self.assertEqual(results['fetched'], 9)

@patch.object(PhotosFetcher, '_fetch_pages')
def test_calls_fetch_pages(self, fetch_pages):
Expand Down Expand Up @@ -457,8 +459,7 @@ def test_call_api_error(self):
self.fetcher._call_api()

@responses.activate
@patch.object(PhotosetsFetcher, '_fetch_extra')
def test_fetches_multiple_pages(self, fetch_extra):
def test_fetches_multiple_pages(self):
"""If the response from the API says there's more than 1 page of
results _fetch_pages() should fetch them all."""
# Alter our default response fixture to set the number of pages to 3:
Expand All @@ -473,12 +474,23 @@ def test_fetches_multiple_pages(self, fetch_extra):
self.add_response(
'photosets.getList', body=body, querystring={'page': '3'})

# We need to mock all the calls to fetch the photosets' photos too:
body_2 = json.dumps(self.load_fixture('photosets.getPhotos'))
self.add_response('photosets.getPhotos', body=body_2,
querystring={'photoset_id':'72157665648859705'})
self.add_response('photosets.getPhotos', body=body_2,
querystring={'photoset_id':'72157662491524213'})
self.add_response('photosets.getPhotos', body=body_2,
querystring={'photoset_id':'72157645155015916'})

with patch('time.sleep'):
self.fetcher._fetch_pages()
results = self.fetcher.fetch()

self.assertEqual(len(responses.calls), 3)
# Our fixture has 3 photosets, so we should now have 9:
self.assertEqual(len(self.fetcher.results), 9)
# 3 Photosets x 3 pages each, plus one getPhotos per photoset:
self.assertEqual(len(responses.calls), 12)
# Our fixture has 3 photosets, and we get 3 pages for each
# photoset, so we should now have 9:
self.assertEqual(results['fetched'], 9)

@patch.object(PhotosetsFetcher, '_fetch_pages')
@patch.object(PhotosetsFetcher, '_fetch_extra')
Expand Down

0 comments on commit d19aa9e

Please sign in to comment.