# Extracted from git patch 56cd5208c6357a868e9c261e70b1b92e06a6ab71
# (Pengyu CHEN, Mon, 3 Apr 2017 02:54:16 +0800):
#   "Added: `scrapinghub.hubstorage.frontier.Frontier.count_slot`"
# (the original subject line misspelt the package as "hustorage").
#
# The definitions below are methods of the frontier client in
# scrapinghub/hubstorage/frontier.py; the enclosing class header is not part
# of this excerpt, so they are shown at top level with an explicit ``self``.
#
# The same patch also touched tests/hubstorage/test_frontier.py
# (test_add_multiple_chunks). Those hunks are only partially present and are
# not reproduced here; the added assertions check that
# ``count_slot(...)['count']`` equals the initial count + 150 once
# ``frontier.newcount`` has grown by 150, and that after a later
# ``frontier.delete(...)`` the difference to the initial count is 150 - 100.


def delete_slot(self, frontier, slot):
    """Delete the slot *slot* of *frontier*.

    Issues ``self.apidelete`` on the path ``(frontier, 's', slot)``.
    """
    self.apidelete((frontier, 's', slot))


def count_slot(self, frontier, slot):
    """Return aggregated ``q/count`` statistics for one frontier slot.

    Repeatedly calls ``self.apiget`` on ``(frontier, 's', slot, 'q/count')``,
    passing the previous response's ``nextstart`` value as the ``start``
    parameter, and sums the ``count`` and ``scanned`` fields of the first
    record of every response.

    :param frontier: frontier name, used as the first path component.
    :param slot: slot name, used as the third path component.
    :return: dict ``{'count': <int>, 'scanned': <int>}`` with the totals
        accumulated over all pages.
    :raises KeyError: if a response record lacks ``count`` or ``scanned``.
    """
    total = {
        'count': 0,
        'scanned': 0,
    }
    start = None
    while True:
        # Materialise the response so the iterator returned by apiget is
        # fully consumed, exactly as the original implementation did.
        page = list(self.apiget(
            (frontier, 's', slot, 'q/count'),
            params={'start': start},
        ))
        if not page:
            # An empty response carries nothing to add and no cursor to
            # follow; stop instead of failing with IndexError on page[0].
            break
        stats = page[0]
        total['count'] += stats['count']
        total['scanned'] += stats['scanned']
        nextstart = stats.get('nextstart')
        # Stop when there is no continuation cursor, or when the server hands
        # back the cursor we just used (no progress would be made and the
        # loop would never terminate).
        if not nextstart or nextstart == start:
            break
        start = nextstart
    return total