From b5495d280e115fce854552bfc77a8fc5272ada88 Mon Sep 17 00:00:00 2001 From: Kevin Meinhardt Date: Mon, 18 Nov 2024 13:21:06 +0100 Subject: [PATCH] Remove all time inefficient loops from mlbf --- .github/workflows/_test.yml | 5 + Makefile-docker | 7 + src/olympia/blocklist/mlbf.py | 136 ++++++++------ src/olympia/blocklist/tests/test_cron.py | 41 +++++ src/olympia/blocklist/tests/test_mlbf.py | 212 +++++++++++----------- src/olympia/blocklist/tests/test_tasks.py | 1 + 6 files changed, 246 insertions(+), 156 deletions(-) diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml index cf2477cbf56a..4cd0d96c7dea 100644 --- a/.github/workflows/_test.yml +++ b/.github/workflows/_test.yml @@ -60,6 +60,11 @@ jobs: services: '' compose_file: docker-compose.yml:docker-compose.ci.yml run: make test_es_tests + - + name: MLBF + services: '' + compose_file: docker-compose.yml:docker-compose.ci.yml + run: make test_mlbf - name: Codestyle services: web diff --git a/Makefile-docker b/Makefile-docker index 4c1a2ce80c47..7b824f07cc8a 100644 --- a/Makefile-docker +++ b/Makefile-docker @@ -163,6 +163,13 @@ test_es: ## run the ES tests -m es_tests \ $(ARGS) +.PHONY: test_mlbf +test_mlbf: ## run the MLBF tests + pytest \ + $(PYTEST_SRC) \ + -m mlbf_tests \ + $(ARGS) + .PHONY: test_no_es test_no_es: ## run all but the ES tests pytest \ diff --git a/src/olympia/blocklist/mlbf.py b/src/olympia/blocklist/mlbf.py index b17ae8ec57c5..dc0f0f39a8a9 100644 --- a/src/olympia/blocklist/mlbf.py +++ b/src/olympia/blocklist/mlbf.py @@ -2,7 +2,8 @@ import os import secrets from enum import Enum -from typing import Dict, List, Optional, Tuple +from functools import lru_cache +from typing import List, Optional, Tuple from django.utils.functional import cached_property @@ -21,15 +22,45 @@ def ordered_diff_lists( - previous: List[str], current: List[str] + current: List[str], previous: List[str], ) -> Tuple[List[str], List[str], int]: - current_set = set(current) previous_set = set(previous) # Use lists instead of sets to maintain order - extras = [x for x in current if x not in previous_set] - deletes = [x for x in previous if x not in current_set] - changed_count = len(extras) + len(deletes) - return extras, deletes, changed_count + return [x for x in current if x not in previous_set] + +def get_not_blocked_items(all_blocks_ids: List[int]) -> List[Tuple[str, str]]: + """ + Returns a list of tuples containing the guid, version of all not blocked + versions. We use distinct to avoid duplicates, order by ID to ensure + cache.json is always sorted consistently, and return the values as a tuple + to make it easier to mock in tests. + """ + return ( + Version.unfiltered.exclude(id__in=all_blocks_ids or ()) + .distinct() + .order_by('id') + .values_list('addon__addonguid__guid', 'version') + ) + + +def get_all_blocked_items() -> List[Tuple[str, str]]: + """ + Returns a list of tuples containing the guid, version, version_id, and + block_type of all blocked items. We use distinct to avoid duplicates, + Order by ID to ensure cache.json is always sorted consistently, + and return the values as a tuple to make it easier to mock in tests. + """ + return ( + BlockVersion.objects.filter(version__file__is_signed=True) + .distinct() + .order_by('id') + .values_list( + 'block__guid', + 'version__version', + 'version_id', + 'block_type', + ) + ) def generate_mlbf(stats, blocked, not_blocked): @@ -136,53 +167,45 @@ def __init__(self, *args, **kwargs): @cached_property def _all_blocks(self): - return ( - BlockVersion.objects.filter(version__file__is_signed=True) - .distinct() - .order_by('id') - .values_list( - 'block__guid', - 'version__version', - 'version_id', - 'block_type', - named=True, - ) - ) + _blocked_version_ids = [] + _blocked = { + BlockType.BLOCKED: [], + BlockType.SOFT_BLOCKED: [], + } - def _format_blocks(self, block_type: BlockType) -> List[str]: - return MLBF.hash_filter_inputs( - [ - (version.block__guid, version.version__version) - for version in self._all_blocks - if version.block_type == block_type - ] - ) + # We define get_all_blocked_items as a separate function to allow + # mocking the database query in tests to simulate large data sets. + for guid, version, id, block_type in get_all_blocked_items(): + _blocked_version_ids.append(id) + _blocked[block_type].append((guid, version)) + + return _blocked, _blocked_version_ids @cached_property - def blocked_items(self) -> List[str]: - return self._format_blocks(BlockType.BLOCKED) + def blocked_items(self): + blocks_dict, _ = self._all_blocks + return MLBF.hash_filter_inputs(blocks_dict[BlockType.BLOCKED]) @cached_property - def soft_blocked_items(self) -> List[str]: - return self._format_blocks(BlockType.SOFT_BLOCKED) + def soft_blocked_items(self): + blocks_dict, _ = self._all_blocks + return MLBF.hash_filter_inputs(blocks_dict[BlockType.SOFT_BLOCKED]) @cached_property - def not_blocked_items(self) -> List[str]: - all_blocks_ids = [version.version_id for version in self._all_blocks] + def not_blocked_items(self): + _, all_blocks_ids = self._all_blocks + # We define not_blocked_items as a separate function to allow + # tests to simulate large data sets. not_blocked_items = MLBF.hash_filter_inputs( - Version.unfiltered.exclude(id__in=all_blocks_ids or ()) - .distinct() - .order_by('id') - .values_list('addon__addonguid__guid', 'version') + get_not_blocked_items(all_blocks_ids) ) blocked_items = set(self.blocked_items + self.soft_blocked_items) # even though we exclude all the version ids in the query there's an # edge case where the version string occurs twice for an addon so we # ensure not_blocked_items contain no blocked_items or soft_blocked_items. - newly_not_blocked_items, _, _ = ordered_diff_lists( + return ordered_diff_lists( blocked_items, not_blocked_items ) - return newly_not_blocked_items class MLBF: @@ -203,8 +226,8 @@ def __init__( self.data: BaseMLBFLoader = data_class(storage=self.storage) @classmethod - def hash_filter_inputs(cls, input_list): - """Returns a list""" + def hash_filter_inputs(cls, input_list: List[Tuple[str, str]]) -> List[str]: + """Returns a list of hashed strings""" return [ cls.KEY_FORMAT.format(guid=guid, version=version) for (guid, version) in input_list @@ -236,16 +259,18 @@ def generate_and_write_filter(self): log.info(json.dumps(stats)) + @lru_cache(maxsize=128) # noqa: B019 def generate_diffs( - self, previous_mlbf: 'MLBF' = None - ) -> Dict[BlockType, Tuple[List[str], List[str], int]]: - return { - block_type: ordered_diff_lists( - [] if previous_mlbf is None else previous_mlbf.data[block_type], - self.data[block_type], - ) - for block_type in BlockType - } + self, + block_type: BlockType, + previous_mlbf: 'MLBF' = None, + ): + current = self.data[block_type] + previous = [] if previous_mlbf is None else previous_mlbf.data[block_type] + added = ordered_diff_lists(current, previous) + removed = ordered_diff_lists(previous, current) + changed_count = len(added) + len(removed) + return added, removed, changed_count def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None): """ @@ -272,15 +297,18 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None): in the "unblocked" stash (like for hard blocked items) as this would result in the version being in both blocked and unblocked stashes. """ - diffs = self.generate_diffs(previous_mlbf) - blocked_added, blocked_removed, _ = diffs[BlockType.BLOCKED] + blocked_added, blocked_removed, _ = self.generate_diffs( + BlockType.BLOCKED, previous_mlbf + ) stash_json = { 'blocked': blocked_added, 'unblocked': blocked_removed, } if waffle.switch_is_active('enable-soft-blocking'): - soft_blocked_added, soft_blocked_removed, _ = diffs[BlockType.SOFT_BLOCKED] + soft_blocked_added, soft_blocked_removed, _ = self.generate_diffs( + BlockType.SOFT_BLOCKED, previous_mlbf + ) stash_json['softblocked'] = soft_blocked_added stash_json['unblocked'] = [ unblocked @@ -298,7 +326,7 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None): def blocks_changed_since_previous( self, block_type: BlockType = BlockType.BLOCKED, previous_mlbf: 'MLBF' = None ): - _, _, changed_count = self.generate_diffs(previous_mlbf)[block_type] + _, _, changed_count = self.generate_diffs(block_type, previous_mlbf) return changed_count @classmethod diff --git a/src/olympia/blocklist/tests/test_cron.py b/src/olympia/blocklist/tests/test_cron.py index 0441c64dd3a8..b3816c91c307 100644 --- a/src/olympia/blocklist/tests/test_cron.py +++ b/src/olympia/blocklist/tests/test_cron.py @@ -34,6 +34,7 @@ STATSD_PREFIX = 'blocklist.cron.upload_mlbf_to_remote_settings.' +@pytest.mark.mlbf_tests @freeze_time('2020-01-01 12:34:56') @override_switch('blocklist_mlbf_submit', active=True) class TestUploadToRemoteSettings(TestCase): @@ -379,6 +380,46 @@ def test_invalid_cache_results_in_diff(self): in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list ) + @mock.patch('olympia.blocklist.mlbf.get_not_blocked_items') + @mock.patch('olympia.blocklist.mlbf.get_all_blocked_items') + def test_large_data_set( + self, + mock_get_all_blocked_items, + mock_get_not_blocked_items, + ): + """ + Test that the cron can handle a large data set + We can safely mock the filter cascase initialize and verify methods + because they massively impact performance of the test and we don't + need to test them here. + """ + two_million_blocked = [ + ( + f'{x}@blocked', + f'{x}@version', + x, + # even numbers are blocked, odd numbers are soft blocked + BlockType.BLOCKED if x % 2 == 0 else BlockType.SOFT_BLOCKED, + ) + for x in range(0, 2_000_000) + ] + one_million_not_blocked = [ + (f'{x}@not_blocked', f'{x}@version') for x in range(2_000_000, 3_000_000) + ] + + mock_get_all_blocked_items.return_value = two_million_blocked + mock_get_not_blocked_items.return_value = one_million_not_blocked + + upload_mlbf_to_remote_settings() + + mlbf = MLBF.load_from_storage(self.current_time) + assert len(mlbf.data.blocked_items) == 1_000_000 + assert len(mlbf.data.soft_blocked_items) == 1_000_000 + assert len(mlbf.data.not_blocked_items) == 1_000_000 + + with override_switch('enable-soft-blocking', active=True): + upload_mlbf_to_remote_settings() + class TestTimeMethods(TestCase): @freeze_time('2024-10-10 12:34:56') diff --git a/src/olympia/blocklist/tests/test_mlbf.py b/src/olympia/blocklist/tests/test_mlbf.py index af2e4adfc1a0..4918b3a675e3 100644 --- a/src/olympia/blocklist/tests/test_mlbf.py +++ b/src/olympia/blocklist/tests/test_mlbf.py @@ -1,6 +1,7 @@ import json from functools import cached_property +import pytest from waffle.testutils import override_switch from olympia import amo @@ -29,6 +30,7 @@ class _MLBFBase(TestCase): def setUp(self): self.storage = SafeStorage() self.user = user_factory() + self.empty_diff = ([], [], 0) def _blocked_addon(self, block_type=BlockType.BLOCKED, **kwargs): addon = addon_factory(**kwargs) @@ -261,6 +263,7 @@ def test_hash_filter_inputs_returns_set_of_hashed_strings(self): assert default == ['guid:version'] +@pytest.mark.mlbf_tests class TestMLBF(_MLBFBase): def test_get_data_from_db(self): self._blocked_addon() @@ -333,28 +336,28 @@ def test_diff_returns_stateless_without_previous(self): addon, _ = self._blocked_addon(file_kw={'is_signed': True}) base_mlbf = MLBF.generate_from_db('base') - stateless_diff = { - BlockType.BLOCKED: ( - MLBF.hash_filter_inputs( - [(addon.block.guid, addon.current_version.version)] - ), - [], - 1, + blocked_diff = ( + MLBF.hash_filter_inputs( + [(addon.block.guid, addon.current_version.version)] ), - BlockType.SOFT_BLOCKED: ([], [], 0), - } + [], + 1, + ) - assert base_mlbf.generate_diffs() == stateless_diff + assert base_mlbf.generate_diffs(BlockType.BLOCKED) == blocked_diff + assert base_mlbf.generate_diffs(BlockType.SOFT_BLOCKED) == self.empty_diff next_mlbf = MLBF.generate_from_db('next') # If we don't include the base_mlbf, unblocked version will still be in the diff - assert next_mlbf.generate_diffs() == stateless_diff + assert next_mlbf.generate_diffs(BlockType.BLOCKED) == blocked_diff + assert next_mlbf.generate_diffs(BlockType.SOFT_BLOCKED) == self.empty_diff # Providing a previous mlbf with the unblocked version already included # removes it from the diff - assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { - BlockType.BLOCKED: ([], [], 0), - BlockType.SOFT_BLOCKED: ([], [], 0), - } + assert next_mlbf.generate_diffs(BlockType.BLOCKED, base_mlbf) == self.empty_diff + assert ( + next_mlbf.generate_diffs(BlockType.SOFT_BLOCKED, base_mlbf) + == self.empty_diff + ) def test_diff_no_changes(self): addon, block = self._blocked_addon() @@ -362,10 +365,11 @@ def test_diff_no_changes(self): base_mlbf = MLBF.generate_from_db('test') next_mlbf = MLBF.generate_from_db('test_two') - assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { - BlockType.BLOCKED: ([], [], 0), - BlockType.SOFT_BLOCKED: ([], [], 0), - } + assert next_mlbf.generate_diffs(BlockType.BLOCKED, base_mlbf) == self.empty_diff + assert ( + next_mlbf.generate_diffs(BlockType.SOFT_BLOCKED, base_mlbf) + == self.empty_diff + ) def test_diff_block_added(self): addon, block = self._blocked_addon() @@ -377,16 +381,17 @@ def test_diff_block_added(self): next_mlbf = MLBF.generate_from_db('test_two') - assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { - BlockType.BLOCKED: ( - MLBF.hash_filter_inputs( - [(new_block.block.guid, new_block.version.version)] - ), - [], - 1, + assert next_mlbf.generate_diffs(BlockType.BLOCKED, base_mlbf) == ( + MLBF.hash_filter_inputs( + [(new_block.block.guid, new_block.version.version)] ), - BlockType.SOFT_BLOCKED: ([], [], 0), - } + [], + 1, + ) + assert ( + next_mlbf.generate_diffs(BlockType.SOFT_BLOCKED, base_mlbf) + == self.empty_diff + ) def test_diff_block_removed(self): addon, block = self._blocked_addon() @@ -397,16 +402,17 @@ def test_diff_block_removed(self): block_version.delete() next_mlbf = MLBF.generate_from_db('test_two') - assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { - BlockType.BLOCKED: ( - [], - MLBF.hash_filter_inputs( - [(block_version.block.guid, block_version.version.version)] - ), - 1, + assert next_mlbf.generate_diffs(BlockType.BLOCKED, base_mlbf) == ( + [], + MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] ), - BlockType.SOFT_BLOCKED: ([], [], 0), - } + 1, + ) + assert ( + next_mlbf.generate_diffs(BlockType.SOFT_BLOCKED, base_mlbf) + == self.empty_diff + ) def test_diff_block_added_and_removed(self): addon, block = self._blocked_addon() @@ -422,18 +428,19 @@ def test_diff_block_added_and_removed(self): next_mlbf = MLBF.generate_from_db('test_two') - assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { - BlockType.BLOCKED: ( - MLBF.hash_filter_inputs( - [(new_block.block.guid, new_block.version.version)] - ), - MLBF.hash_filter_inputs( - [(block_version.block.guid, block_version.version.version)] - ), - 2, + assert next_mlbf.generate_diffs(BlockType.BLOCKED, base_mlbf) == ( + MLBF.hash_filter_inputs( + [(new_block.block.guid, new_block.version.version)] ), - BlockType.SOFT_BLOCKED: ([], [], 0), - } + MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] + ), + 2, + ) + assert ( + next_mlbf.generate_diffs(BlockType.SOFT_BLOCKED, base_mlbf) + == self.empty_diff + ) def test_diff_block_hard_to_soft(self): addon, block = self._blocked_addon() @@ -444,22 +451,20 @@ def test_diff_block_hard_to_soft(self): block_version.update(block_type=BlockType.SOFT_BLOCKED) next_mlbf = MLBF.generate_from_db('test_two') - assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { - BlockType.BLOCKED: ( - [], - MLBF.hash_filter_inputs( - [(block_version.block.guid, block_version.version.version)] - ), - 1, + assert next_mlbf.generate_diffs(BlockType.BLOCKED, base_mlbf) == ( + [], + MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] ), - BlockType.SOFT_BLOCKED: ( - MLBF.hash_filter_inputs( - [(block_version.block.guid, block_version.version.version)] - ), - [], - 1, + 1, + ) + assert next_mlbf.generate_diffs(BlockType.SOFT_BLOCKED, base_mlbf) == ( + MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] ), - } + [], + 1, + ) def test_diff_block_soft_to_hard(self): addon, block = self._blocked_addon() @@ -470,22 +475,20 @@ def test_diff_block_soft_to_hard(self): block_version.update(block_type=BlockType.BLOCKED) next_mlbf = MLBF.generate_from_db('test_two') - assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == { - BlockType.BLOCKED: ( - MLBF.hash_filter_inputs( - [(block_version.block.guid, block_version.version.version)] - ), - [], - 1, + assert next_mlbf.generate_diffs(BlockType.BLOCKED, base_mlbf) == ( + MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] ), - BlockType.SOFT_BLOCKED: ( - [], - MLBF.hash_filter_inputs( - [(block_version.block.guid, block_version.version.version)] - ), - 1, + [], + 1, + ) + assert next_mlbf.generate_diffs(BlockType.SOFT_BLOCKED, base_mlbf) == ( + [], + MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] ), - } + 1, + ) def test_diff_invalid_cache(self): addon, block = self._blocked_addon(file_kw={'is_signed': True}) @@ -507,16 +510,18 @@ def test_diff_invalid_cache(self): # The diff should include the soft blocked version because it was removed # and should not include the blocked version because it was not changed - assert mlbf.generate_diffs(previous_mlbf=previous_mlbf) == { - BlockType.BLOCKED: ([], [], 0), - BlockType.SOFT_BLOCKED: ( - MLBF.hash_filter_inputs( - [(soft_blocked.block.guid, soft_blocked.version.version)] - ), - [], - 1, + assert mlbf.generate_diffs(BlockType.BLOCKED, previous_mlbf) == ( + [], + [], + 0, + ) + assert mlbf.generate_diffs(BlockType.SOFT_BLOCKED, previous_mlbf) == ( + MLBF.hash_filter_inputs( + [(soft_blocked.block.guid, soft_blocked.version.version)] ), - } + [], + 1, + ) def test_diff_all_possible_changes(self): """ @@ -570,10 +575,16 @@ def test_diff_all_possible_changes(self): first_mlbf = MLBF.generate_from_db('first') # We expect the 4 blocked versions to be in the diff, sorted by block type. - assert first_mlbf.generate_diffs() == { - BlockType.BLOCKED: ([five_hash, six_hash], [], 2), - BlockType.SOFT_BLOCKED: ([three_hash, four_hash], [], 2), - } + assert first_mlbf.generate_diffs(BlockType.BLOCKED) == ( + [five_hash, six_hash], + [], + 2, + ) + assert first_mlbf.generate_diffs(BlockType.SOFT_BLOCKED) == ( + [three_hash, four_hash], + [], + 2, + ) # The first time we generate the stash, we expect 3 to 6 to be in the stash # as they have some kind of block applied. @@ -614,19 +625,16 @@ def test_diff_all_possible_changes(self): # The order is based on the ID (i.e. creation time) of the block, # not the version so we expect two after three since two was # blocked after three. - assert second_mlbf.generate_diffs(previous_mlbf=first_mlbf) == { - BlockType.BLOCKED: ( - [three_hash, two_hash], - [five_hash, six_hash], - 4, - ), - # Same as above, one had a block created after five so it comes second. - BlockType.SOFT_BLOCKED: ( - [five_hash, one_hash], - [three_hash, four_hash], - 4, - ), - } + assert second_mlbf.generate_diffs(BlockType.BLOCKED, first_mlbf) == ( + [three_hash, two_hash], + [five_hash, six_hash], + 4, + ) + assert second_mlbf.generate_diffs(BlockType.SOFT_BLOCKED, first_mlbf) == ( + [five_hash, one_hash], + [three_hash, four_hash], + 4, + ) assert second_mlbf.generate_and_write_stash(previous_mlbf=first_mlbf) == { 'blocked': [three_hash, two_hash], diff --git a/src/olympia/blocklist/tests/test_tasks.py b/src/olympia/blocklist/tests/test_tasks.py index cf17fc07dca1..92dede100230 100644 --- a/src/olympia/blocklist/tests/test_tasks.py +++ b/src/olympia/blocklist/tests/test_tasks.py @@ -124,6 +124,7 @@ def test_calls_save_to_block_objects_or_delete_block_objects_depending_on_action @pytest.mark.django_db +@pytest.mark.mlbf_tests class TestUploadMLBFToRemoteSettings(TestCase): def setUp(self): self.user = user_factory()