Skip to content

Commit

Permalink
Remove all time-inefficient loops from mlbf
Browse files Browse the repository at this point in the history
  • Loading branch information
KevinMind committed Nov 15, 2024
1 parent 794c227 commit 42483e3
Show file tree
Hide file tree
Showing 6 changed files with 248 additions and 153 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ jobs:
services: ''
compose_file: docker-compose.yml:docker-compose.ci.yml
run: make test_es_tests
-
name: MLBF
services: ''
compose_file: docker-compose.yml:docker-compose.ci.yml
run: make test_mlbf
-
name: Codestyle
services: web
Expand Down
7 changes: 7 additions & 0 deletions Makefile-docker
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,13 @@ test_es: ## run the ES tests
-m es_tests \
$(ARGS)

.PHONY: test_mlbf
test_mlbf: ## run the MLBF tests
pytest \
$(PYTEST_SRC) \
-m mlbf_tests \
$(ARGS)

.PHONY: test_no_es
test_no_es: ## run all but the ES tests
pytest \
Expand Down
135 changes: 84 additions & 51 deletions src/olympia/blocklist/mlbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import os
import secrets
from enum import Enum
from typing import Dict, List, Optional, Tuple
from functools import lru_cache
from typing import List, Optional, Tuple

from django.utils.functional import cached_property

Expand Down Expand Up @@ -32,6 +33,41 @@ def ordered_diff_lists(
return extras, deletes, changed_count


def get_not_blocked_items(all_blocks_ids: List[int]) -> List[Tuple[str, str]]:
    """
    Returns a list of tuples containing the guid, version of all not blocked
    versions. We use distinct to avoid duplicates, order by ID to ensure
    cache.json is always sorted consistently, and return the values as a tuple
    to make it easier to mock in tests.

    NOTE(review): despite the annotation, this actually returns a lazy Django
    ``QuerySet`` of 2-tuples (``values_list`` without ``flat``), not a
    materialized list — confirm callers are fine with lazy evaluation.
    """
    return (
        # `or ()` normalizes an empty/None id list so the exclude clause
        # excludes nothing in that case.
        Version.unfiltered.exclude(id__in=all_blocks_ids or ())
        .distinct()
        .order_by('id')
        .values_list('addon__addonguid__guid', 'version')
    )


def get_all_blocked_items() -> List[Tuple[str, str]]:
    """
    Returns a list of tuples containing the guid, version, version_id, and
    block_type of all blocked items with a signed file. We use distinct to
    avoid duplicates, order by ID to ensure cache.json is always sorted
    consistently, and return the values as a tuple to make it easier to mock
    in tests.

    NOTE(review): the annotation says ``Tuple[str, str]`` but the query
    selects four fields, so each row is a 4-tuple — and, as above, the value
    is a lazy ``QuerySet`` rather than a list. The annotation looks stale;
    confirm and update.
    """
    return (
        BlockVersion.objects.filter(version__file__is_signed=True)
        .distinct()
        .order_by('id')
        .values_list(
            'block__guid',
            'version__version',
            'version_id',
            'block_type',
        )
    )


def generate_mlbf(stats, blocked, not_blocked):
log.info('Starting to generating bloomfilter')

Expand Down Expand Up @@ -136,53 +172,46 @@ def __init__(self, *args, **kwargs):

@cached_property
def _all_blocks(self):
return (
BlockVersion.objects.filter(version__file__is_signed=True)
.distinct()
.order_by('id')
.values_list(
'block__guid',
'version__version',
'version_id',
'block_type',
named=True,
)
)
_blocked_version_ids = []
_blocked = {
BlockType.BLOCKED: [],
BlockType.SOFT_BLOCKED: [],
}

def _format_blocks(self, block_type: BlockType) -> List[str]:
return MLBF.hash_filter_inputs(
[
(version.block__guid, version.version__version)
for version in self._all_blocks
if version.block_type == block_type
]
)
# We define get_all_blocked_items as a separate function to allow
# mocking the database query in tests to simulate large data sets.
for guid, version, id, block_type in get_all_blocked_items():
_blocked_version_ids.append(id)
_blocked[block_type].append((guid, version))

return _blocked, _blocked_version_ids

@cached_property
def blocked_items(self) -> List[str]:
return self._format_blocks(BlockType.BLOCKED)
def blocked_items(self):
blocks_dict, _ = self._all_blocks
return MLBF.hash_filter_inputs(blocks_dict[BlockType.BLOCKED])

@cached_property
def soft_blocked_items(self) -> List[str]:
return self._format_blocks(BlockType.SOFT_BLOCKED)
def soft_blocked_items(self):
blocks_dict, _ = self._all_blocks
return MLBF.hash_filter_inputs(blocks_dict[BlockType.SOFT_BLOCKED])

@cached_property
def not_blocked_items(self) -> List[str]:
all_blocks_ids = [version.version_id for version in self._all_blocks]
def not_blocked_items(self):
_, all_blocks_ids = self._all_blocks
# We define not_blocked_items as a separate function to allow
# tests to simulate large data sets.
not_blocked_items = MLBF.hash_filter_inputs(
Version.unfiltered.exclude(id__in=all_blocks_ids or ())
.distinct()
.order_by('id')
.values_list('addon__addonguid__guid', 'version')
get_not_blocked_items(all_blocks_ids)
)
blocked_items = set(self.blocked_items + self.soft_blocked_items)
# even though we exclude all the version ids in the query there's an
# edge case where the version string occurs twice for an addon so we
# ensure not_blocked_items contain no blocked_items or soft_blocked_items.
return [
item
for item in not_blocked_items
if item not in self.blocked_items + self.soft_blocked_items
]
filtered_not_blocked_items, _, _ = ordered_diff_lists(
blocked_items, not_blocked_items
)
return filtered_not_blocked_items


class MLBF:
Expand All @@ -203,8 +232,8 @@ def __init__(
self.data: BaseMLBFLoader = data_class(storage=self.storage)

@classmethod
def hash_filter_inputs(cls, input_list):
"""Returns a list"""
def hash_filter_inputs(cls, input_list: List[Tuple[str, str]]) -> List[str]:
"""Returns a list of hashed strings"""
return [
cls.KEY_FORMAT.format(guid=guid, version=version)
for (guid, version) in input_list
Expand Down Expand Up @@ -236,16 +265,17 @@ def generate_and_write_filter(self):

log.info(json.dumps(stats))

@lru_cache(maxsize=128) # noqa: B019
def generate_diffs(
self, previous_mlbf: 'MLBF' = None
) -> Dict[BlockType, Tuple[List[str], List[str], int]]:
return {
block_type: ordered_diff_lists(
[] if previous_mlbf is None else previous_mlbf.data[block_type],
self.data[block_type],
)
for block_type in BlockType
}
self,
block_type: BlockType,
previous_mlbf: 'MLBF' = None,
):
added, removed, changed_count = ordered_diff_lists(
[] if previous_mlbf is None else previous_mlbf.data[block_type],
self.data[block_type],
)
return added, removed, changed_count

def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
"""
Expand All @@ -272,15 +302,18 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
in the "unblocked" stash (like for hard blocked items) as this would
result in the version being in both blocked and unblocked stashes.
"""
diffs = self.generate_diffs(previous_mlbf)
blocked_added, blocked_removed, _ = diffs[BlockType.BLOCKED]
blocked_added, blocked_removed, _ = self.generate_diffs(
BlockType.BLOCKED, previous_mlbf
)
stash_json = {
'blocked': blocked_added,
'unblocked': blocked_removed,
}

if waffle.switch_is_active('enable-soft-blocking'):
soft_blocked_added, soft_blocked_removed, _ = diffs[BlockType.SOFT_BLOCKED]
soft_blocked_added, soft_blocked_removed, _ = self.generate_diffs(
BlockType.SOFT_BLOCKED, previous_mlbf
)
stash_json['softblocked'] = soft_blocked_added
stash_json['unblocked'] = [
unblocked
Expand All @@ -298,7 +331,7 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
def blocks_changed_since_previous(
self, block_type: BlockType = BlockType.BLOCKED, previous_mlbf: 'MLBF' = None
):
_, _, changed_count = self.generate_diffs(previous_mlbf)[block_type]
_, _, changed_count = self.generate_diffs(block_type, previous_mlbf)
return changed_count

@classmethod
Expand Down
41 changes: 41 additions & 0 deletions src/olympia/blocklist/tests/test_cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
STATSD_PREFIX = 'blocklist.cron.upload_mlbf_to_remote_settings.'


@pytest.mark.mlbf_tests
@freeze_time('2020-01-01 12:34:56')
@override_switch('blocklist_mlbf_submit', active=True)
class TestUploadToRemoteSettings(TestCase):
Expand Down Expand Up @@ -379,6 +380,46 @@ def test_invalid_cache_results_in_diff(self):
in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list
)

# NOTE: mock.patch decorators apply bottom-up, so the decorator closest to
# the def (get_all_blocked_items) maps to the first mock argument.
@mock.patch('olympia.blocklist.mlbf.get_not_blocked_items')
@mock.patch('olympia.blocklist.mlbf.get_all_blocked_items')
def test_large_data_set(
    self,
    mock_get_all_blocked_items,
    mock_get_not_blocked_items,
):
    """
    Test that the cron can handle a large data set.

    The database-query helpers are mocked so that three million
    guid/version rows can be fed through the pipeline without creating
    any actual database records.

    NOTE(review): the original docstring said the filter cascade
    initialize and verify methods are mocked "because they massively
    impact performance of the test" — that mocking is not visible in this
    method body; presumably it happens elsewhere (e.g. setUp). Confirm.
    """
    two_million_blocked = [
        (
            f'{x}@blocked',
            f'{x}@version',
            x,
            # even numbers are blocked, odd numbers are soft blocked
            BlockType.BLOCKED if x % 2 == 0 else BlockType.SOFT_BLOCKED,
        )
        for x in range(0, 2_000_000)
    ]
    one_million_not_blocked = [
        (f'{x}@not_blocked', f'{x}@version') for x in range(2_000_000, 3_000_000)
    ]

    mock_get_all_blocked_items.return_value = two_million_blocked
    mock_get_not_blocked_items.return_value = one_million_not_blocked

    upload_mlbf_to_remote_settings()

    # Reload the generated filter data from storage and verify the 2M
    # blocked rows split evenly between hard and soft blocks, and that
    # the 1M not-blocked rows survive intact.
    mlbf = MLBF.load_from_storage(self.current_time)
    assert len(mlbf.data.blocked_items) == 1_000_000
    assert len(mlbf.data.soft_blocked_items) == 1_000_000
    assert len(mlbf.data.not_blocked_items) == 1_000_000

    # Run again with soft blocking enabled to exercise that code path as
    # well (no extra assertions; the run just must not fail).
    with override_switch('enable-soft-blocking', active=True):
        upload_mlbf_to_remote_settings()


class TestTimeMethods(TestCase):
@freeze_time('2024-10-10 12:34:56')
Expand Down
Loading

0 comments on commit 42483e3

Please sign in to comment.