Skip to content

Commit

Permalink
Remove all time-inefficient loops from mlbf
Browse files Browse the repository at this point in the history
  • Loading branch information
KevinMind committed Nov 15, 2024
1 parent 794c227 commit 42483e3
Show file tree
Hide file tree
Showing 6 changed files with 248 additions and 153 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ jobs:
services: ''
compose_file: docker-compose.yml:docker-compose.ci.yml
run: make test_es_tests
-
name: MLBF
services: ''
compose_file: docker-compose.yml:docker-compose.ci.yml
run: make test_mlbf
-
name: Codestyle
services: web
Expand Down
7 changes: 7 additions & 0 deletions Makefile-docker
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,13 @@ test_es: ## run the ES tests
-m es_tests \
$(ARGS)

.PHONY: test_mlbf
test_mlbf: ## run the MLBF tests
pytest \
$(PYTEST_SRC) \
-m mlbf_tests \
$(ARGS)

.PHONY: test_no_es
test_no_es: ## run all but the ES tests
pytest \
Expand Down
135 changes: 84 additions & 51 deletions src/olympia/blocklist/mlbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import os
import secrets
from enum import Enum
from typing import Dict, List, Optional, Tuple
from functools import lru_cache
from typing import List, Optional, Tuple

from django.utils.functional import cached_property

Expand Down Expand Up @@ -32,6 +33,41 @@ def ordered_diff_lists(
return extras, deletes, changed_count


def get_not_blocked_items(all_blocks_ids: List[int]) -> List[Tuple[str, str]]:
    """
    Returns a list of tuples containing the guid, version of all not blocked
    versions. We use distinct to avoid duplicates, order by ID to ensure
    cache.json is always sorted consistently, and return the values as a tuple
    to make it easier to mock in tests.

    NOTE(review): despite the annotation, this actually returns a lazy Django
    ``QuerySet`` of 2-tuples (``values_list`` without ``flat``), not a
    materialized list — confirm callers are fine with lazy evaluation.
    """
    return (
        # `or ()` normalizes an empty/None id list so the exclude clause
        # excludes nothing in that case.
        Version.unfiltered.exclude(id__in=all_blocks_ids or ())
        .distinct()
        .order_by('id')
        .values_list('addon__addonguid__guid', 'version')
    )


def get_all_blocked_items() -> List[Tuple[str, str]]:
    """
    Returns a list of tuples containing the guid, version, version_id, and
    block_type of all blocked items with a signed file. We use distinct to
    avoid duplicates, order by ID to ensure cache.json is always sorted
    consistently, and return the values as a tuple to make it easier to mock
    in tests.

    NOTE(review): the annotation says ``Tuple[str, str]`` but the query
    selects four fields, so each row is a 4-tuple — and, as above, the value
    is a lazy ``QuerySet`` rather than a list. The annotation looks stale;
    confirm and update.
    """
    return (
        BlockVersion.objects.filter(version__file__is_signed=True)
        .distinct()
        .order_by('id')
        .values_list(
            'block__guid',
            'version__version',
            'version_id',
            'block_type',
        )
    )


def generate_mlbf(stats, blocked, not_blocked):
log.info('Starting to generating bloomfilter')

Expand Down Expand Up @@ -136,53 +172,46 @@ def __init__(self, *args, **kwargs):

@cached_property
def _all_blocks(self):
return (
BlockVersion.objects.filter(version__file__is_signed=True)
.distinct()
.order_by('id')
.values_list(
'block__guid',
'version__version',
'version_id',
'block_type',
named=True,
)
)
_blocked_version_ids = []
_blocked = {
BlockType.BLOCKED: [],
BlockType.SOFT_BLOCKED: [],
}

def _format_blocks(self, block_type: BlockType) -> List[str]:
return MLBF.hash_filter_inputs(
[
(version.block__guid, version.version__version)
for version in self._all_blocks
if version.block_type == block_type
]
)
# We define get_all_blocked_items as a separate function to allow
# mocking the database query in tests to simulate large data sets.
for guid, version, id, block_type in get_all_blocked_items():
_blocked_version_ids.append(id)
_blocked[block_type].append((guid, version))

return _blocked, _blocked_version_ids

@cached_property
def blocked_items(self) -> List[str]:
return self._format_blocks(BlockType.BLOCKED)
def blocked_items(self):
blocks_dict, _ = self._all_blocks
return MLBF.hash_filter_inputs(blocks_dict[BlockType.BLOCKED])

@cached_property
def soft_blocked_items(self) -> List[str]:
return self._format_blocks(BlockType.SOFT_BLOCKED)
def soft_blocked_items(self):
blocks_dict, _ = self._all_blocks
return MLBF.hash_filter_inputs(blocks_dict[BlockType.SOFT_BLOCKED])

@cached_property
def not_blocked_items(self) -> List[str]:
all_blocks_ids = [version.version_id for version in self._all_blocks]
def not_blocked_items(self):
_, all_blocks_ids = self._all_blocks
# We define not_blocked_items as a separate function to allow
# tests to simulate large data sets.
not_blocked_items = MLBF.hash_filter_inputs(
Version.unfiltered.exclude(id__in=all_blocks_ids or ())
.distinct()
.order_by('id')
.values_list('addon__addonguid__guid', 'version')
get_not_blocked_items(all_blocks_ids)
)
blocked_items = set(self.blocked_items + self.soft_blocked_items)
# even though we exclude all the version ids in the query there's an
# edge case where the version string occurs twice for an addon so we
# ensure not_blocked_items contain no blocked_items or soft_blocked_items.
return [
item
for item in not_blocked_items
if item not in self.blocked_items + self.soft_blocked_items
]
filtered_not_blocked_items, _, _ = ordered_diff_lists(
blocked_items, not_blocked_items
)
return filtered_not_blocked_items


class MLBF:
Expand All @@ -203,8 +232,8 @@ def __init__(
self.data: BaseMLBFLoader = data_class(storage=self.storage)

@classmethod
def hash_filter_inputs(cls, input_list):
"""Returns a list"""
def hash_filter_inputs(cls, input_list: List[Tuple[str, str]]) -> List[str]:
"""Returns a list of hashed strings"""
return [
cls.KEY_FORMAT.format(guid=guid, version=version)
for (guid, version) in input_list
Expand Down Expand Up @@ -236,16 +265,17 @@ def generate_and_write_filter(self):

log.info(json.dumps(stats))

@lru_cache(maxsize=128) # noqa: B019
def generate_diffs(
self, previous_mlbf: 'MLBF' = None
) -> Dict[BlockType, Tuple[List[str], List[str], int]]:
return {
block_type: ordered_diff_lists(
[] if previous_mlbf is None else previous_mlbf.data[block_type],
self.data[block_type],
)
for block_type in BlockType
}
self,
block_type: BlockType,
previous_mlbf: 'MLBF' = None,
):
added, removed, changed_count = ordered_diff_lists(
[] if previous_mlbf is None else previous_mlbf.data[block_type],
self.data[block_type],
)
return added, removed, changed_count

def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
"""
Expand All @@ -272,15 +302,18 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
in the "unblocked" stash (like for hard blocked items) as this would
result in the version being in both blocked and unblocked stashes.
"""
diffs = self.generate_diffs(previous_mlbf)
blocked_added, blocked_removed, _ = diffs[BlockType.BLOCKED]
blocked_added, blocked_removed, _ = self.generate_diffs(
BlockType.BLOCKED, previous_mlbf
)
stash_json = {
'blocked': blocked_added,
'unblocked': blocked_removed,
}

if waffle.switch_is_active('enable-soft-blocking'):
soft_blocked_added, soft_blocked_removed, _ = diffs[BlockType.SOFT_BLOCKED]
soft_blocked_added, soft_blocked_removed, _ = self.generate_diffs(
BlockType.SOFT_BLOCKED, previous_mlbf
)
stash_json['softblocked'] = soft_blocked_added
stash_json['unblocked'] = [
unblocked
Expand All @@ -298,7 +331,7 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
def blocks_changed_since_previous(
self, block_type: BlockType = BlockType.BLOCKED, previous_mlbf: 'MLBF' = None
):
_, _, changed_count = self.generate_diffs(previous_mlbf)[block_type]
_, _, changed_count = self.generate_diffs(block_type, previous_mlbf)
return changed_count

@classmethod
Expand Down
41 changes: 41 additions & 0 deletions src/olympia/blocklist/tests/test_cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
STATSD_PREFIX = 'blocklist.cron.upload_mlbf_to_remote_settings.'


@pytest.mark.mlbf_tests
@freeze_time('2020-01-01 12:34:56')
@override_switch('blocklist_mlbf_submit', active=True)
class TestUploadToRemoteSettings(TestCase):
Expand Down Expand Up @@ -379,6 +380,46 @@ def test_invalid_cache_results_in_diff(self):
in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list
)

# NOTE: mock.patch decorators apply bottom-up, so the decorator closest to
# the def (get_all_blocked_items) maps to the first mock argument.
@mock.patch('olympia.blocklist.mlbf.get_not_blocked_items')
@mock.patch('olympia.blocklist.mlbf.get_all_blocked_items')
def test_large_data_set(
    self,
    mock_get_all_blocked_items,
    mock_get_not_blocked_items,
):
    """
    Test that the cron can handle a large data set.

    The database-query helpers are mocked so that three million
    guid/version rows can be fed through the pipeline without creating
    any actual database records.

    NOTE(review): the original docstring said the filter cascade
    initialize and verify methods are mocked "because they massively
    impact performance of the test" — that mocking is not visible in this
    method body; presumably it happens elsewhere (e.g. setUp). Confirm.
    """
    two_million_blocked = [
        (
            f'{x}@blocked',
            f'{x}@version',
            x,
            # even numbers are blocked, odd numbers are soft blocked
            BlockType.BLOCKED if x % 2 == 0 else BlockType.SOFT_BLOCKED,
        )
        for x in range(0, 2_000_000)
    ]
    one_million_not_blocked = [
        (f'{x}@not_blocked', f'{x}@version') for x in range(2_000_000, 3_000_000)
    ]

    mock_get_all_blocked_items.return_value = two_million_blocked
    mock_get_not_blocked_items.return_value = one_million_not_blocked

    upload_mlbf_to_remote_settings()

    # Reload the generated filter data from storage and verify the 2M
    # blocked rows split evenly between hard and soft blocks, and that
    # the 1M not-blocked rows survive intact.
    mlbf = MLBF.load_from_storage(self.current_time)
    assert len(mlbf.data.blocked_items) == 1_000_000
    assert len(mlbf.data.soft_blocked_items) == 1_000_000
    assert len(mlbf.data.not_blocked_items) == 1_000_000

    # Run again with soft blocking enabled to exercise that code path as
    # well (no extra assertions; the run just must not fail).
    with override_switch('enable-soft-blocking', active=True):
        upload_mlbf_to_remote_settings()


class TestTimeMethods(TestCase):
@freeze_time('2024-10-10 12:34:56')
Expand Down
Loading

0 comments on commit 42483e3

Please sign in to comment.