Skip to content

Commit

Permalink
Remove time inefficient loops in mlbf
Browse files Browse the repository at this point in the history
Remove test

Remove all time inefficient loops from mlbf
  • Loading branch information
KevinMind committed Nov 21, 2024
1 parent 644f2e3 commit 0406b1a
Show file tree
Hide file tree
Showing 8 changed files with 289 additions and 165 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ jobs:
services: ''
compose_file: docker-compose.yml:docker-compose.ci.yml
run: make test_es_tests
-
name: MLBF
services: ''
compose_file: docker-compose.yml:docker-compose.ci.yml
run: make test_mlbf
-
name: Codestyle
services: web
Expand Down
7 changes: 7 additions & 0 deletions Makefile-docker
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,13 @@ test_es: ## run the ES tests
-m es_tests \
$(ARGS)

.PHONY: test_mlbf
test_mlbf: ## run the MLBF tests
pytest \
$(PYTEST_SRC) \
-m mlbf_tests \
$(ARGS)

.PHONY: test_no_es
test_no_es: ## run all but the ES tests
pytest \
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ norecursedirs = [
"venv",
"__pycache__",
]
# Disables the timeout globally. We can investigate
# setting an SLA for tests in the future.
timeout = 0
DJANGO_SETTINGS_MODULE = "settings_test"
# Ignoring csp deprecation warnings, we have control over the module and
# currently it warns for child-src which is deprecated in CSPv3 but we're still
Expand Down
3 changes: 3 additions & 0 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -311,3 +311,6 @@ pytest-reportlog==0.4.0 \
django-dbbackup==4.2.1 \
--hash=sha256:157a2ec10d482345cd75092e510ac40d6e2ee6084604a1d17abe178c2f06bc69 \
--hash=sha256:b23265600ead0780ca781b1b4b594949aaa8a20d74f08701f91ee9d7eb1f08cd
pytest-timeout==2.3.1 \
--hash=sha256:12397729125c6ecbdaca01035b9e5239d4db97352320af155b3f5de1ba5165d9 \
--hash=sha256:68188cb703edfc6a18fad98dc25a3c61e9f24d644b0b70f33af545219fc7813e
150 changes: 91 additions & 59 deletions src/olympia/blocklist/mlbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import os
import secrets
from enum import Enum
from typing import Dict, List, Optional, Tuple
from functools import lru_cache
from typing import List, Optional, Tuple

from django.utils.functional import cached_property

Expand All @@ -21,15 +22,47 @@


def ordered_diff_lists(
previous: List[str], current: List[str]
current: List[str],
previous: List[str],
) -> Tuple[List[str], List[str], int]:
current_set = set(current)
previous_set = set(previous)
# Use lists instead of sets to maintain order
extras = [x for x in current if x not in previous_set]
deletes = [x for x in previous if x not in current_set]
changed_count = len(extras) + len(deletes)
return extras, deletes, changed_count
return [x for x in current if x not in previous_set]


def get_not_blocked_items(all_blocked_version_ids: List[int]):
    """
    Return (guid, version) value tuples for every version that is not blocked.

    Duplicates are removed with distinct(), rows are ordered by ID so that
    cache.json is always serialized consistently, and plain value tuples are
    returned so the query is easy to mock in tests.
    """
    excluded_ids = all_blocked_version_ids or ()
    not_blocked = Version.unfiltered.exclude(id__in=excluded_ids)
    return (
        not_blocked.distinct()
        .order_by('id')
        .values_list('addon__addonguid__guid', 'version')
    )


def get_all_blocked_items():
    """
    Return (guid, version, version_id, block_type) value tuples for every
    blocked version with a signed file.

    Duplicates are removed with distinct(), rows are ordered by ID so that
    cache.json is always serialized consistently, and plain value tuples are
    returned so the query is easy to mock in tests.
    """
    # Fields are listed once here so the tuple shape is easy to see at a glance.
    fields = (
        'block__guid',
        'version__version',
        'version_id',
        'block_type',
    )
    blocked = BlockVersion.objects.filter(version__file__is_signed=True)
    return blocked.distinct().order_by('id').values_list(*fields)


def generate_mlbf(stats, blocked, not_blocked):
Expand Down Expand Up @@ -136,50 +169,46 @@ def __init__(self, *args, **kwargs):

@cached_property
def _all_blocks(self):
return (
BlockVersion.objects.filter(version__file__is_signed=True)
.distinct()
.order_by('id')
.values_list(
'block__guid',
'version__version',
'version_id',
'block_type',
named=True,
)
)
_blocked_version_ids = []
_blocked = {
BlockType.BLOCKED: [],
BlockType.SOFT_BLOCKED: [],
}

def _format_blocks(self, block_type: BlockType) -> List[str]:
return MLBF.hash_filter_inputs(
[
(version.block__guid, version.version__version)
for version in self._all_blocks
if version.block_type == block_type
]
)
# We define get_all_blocked_items as a separate function to allow
# mocking the database query in tests to simulate large data sets.
for guid, version_string, version_id, block_type in get_all_blocked_items():
_blocked_version_ids.append(version_id)
_blocked[block_type].append((guid, version_string))

return _blocked, _blocked_version_ids

@cached_property
def blocked_items(self) -> List[str]:
return self._format_blocks(BlockType.BLOCKED)
def blocked_items(self):
blocks_dict, _ = self._all_blocks
return MLBF.hash_filter_inputs(blocks_dict[BlockType.BLOCKED])

@cached_property
def soft_blocked_items(self) -> List[str]:
return self._format_blocks(BlockType.SOFT_BLOCKED)
def soft_blocked_items(self):
blocks_dict, _ = self._all_blocks
return MLBF.hash_filter_inputs(blocks_dict[BlockType.SOFT_BLOCKED])

@cached_property
def not_blocked_items(self) -> List[str]:
all_blocks_ids = [version.version_id for version in self._all_blocks]
def not_blocked_items(self):
_, all_blocked_version_ids = self._all_blocks
# We define not_blocked_items as a separate function to allow
# tests to simulate large data sets.
not_blocked_items = MLBF.hash_filter_inputs(
Version.unfiltered.exclude(id__in=all_blocks_ids or ())
.distinct()
.order_by('id')
.values_list('addon__addonguid__guid', 'version')
get_not_blocked_items(all_blocked_version_ids)
)
blocked_items = set(self.blocked_items + self.soft_blocked_items)
# even though we exclude all the version ids in the query there's an
# edge case where the version string occurs twice for an addon so we
# ensure not_blocked_items contain no blocked_items or soft_blocked_items.
return [item for item in not_blocked_items if item not in blocked_items]
return ordered_diff_lists(
not_blocked_items,
blocked_items,
)


class MLBF:
Expand All @@ -200,8 +229,8 @@ def __init__(
self.data: BaseMLBFLoader = data_class(storage=self.storage)

@classmethod
def hash_filter_inputs(cls, input_list):
"""Returns a list"""
def hash_filter_inputs(cls, input_list: List[Tuple[str, str]]) -> List[str]:
"""Returns a list of hashed strings"""
return [
cls.KEY_FORMAT.format(guid=guid, version=version)
for (guid, version) in input_list
Expand Down Expand Up @@ -233,16 +262,17 @@ def generate_and_write_filter(self):

log.info(json.dumps(stats))

@lru_cache(maxsize=128) # noqa: B019
def generate_diffs(
self, previous_mlbf: 'MLBF' = None
) -> Dict[BlockType, Tuple[List[str], List[str], int]]:
return {
block_type: ordered_diff_lists(
[] if previous_mlbf is None else previous_mlbf.data[block_type],
self.data[block_type],
)
for block_type in BlockType
}
self,
block_type: BlockType,
previous_mlbf: 'MLBF' = None,
):
current = self.data[block_type]
previous = [] if previous_mlbf is None else previous_mlbf.data[block_type]
added = ordered_diff_lists(current, previous)
removed = ordered_diff_lists(previous, current)
return added, removed

def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
"""
Expand All @@ -269,21 +299,23 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
in the "unblocked" stash (like for hard blocked items) as this would
result in the version being in both blocked and unblocked stashes.
"""
diffs = self.generate_diffs(previous_mlbf)
blocked_added, blocked_removed, _ = diffs[BlockType.BLOCKED]
blocked_added, blocked_removed = self.generate_diffs(
BlockType.BLOCKED, previous_mlbf
)
stash_json = {
'blocked': blocked_added,
'unblocked': blocked_removed,
}

if waffle.switch_is_active('enable-soft-blocking'):
soft_blocked_added, soft_blocked_removed, _ = diffs[BlockType.SOFT_BLOCKED]
soft_blocked_added, soft_blocked_removed = self.generate_diffs(
BlockType.SOFT_BLOCKED, previous_mlbf
)
stash_json['softblocked'] = soft_blocked_added
stash_json['unblocked'] = [
unblocked
for unblocked in (blocked_removed + soft_blocked_removed)
if unblocked not in blocked_added
]
stash_json['unblocked'] = ordered_diff_lists(
blocked_removed + soft_blocked_removed,
blocked_added,
)

# write stash
stash_path = self.stash_path
Expand All @@ -295,8 +327,8 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
def blocks_changed_since_previous(
self, block_type: BlockType = BlockType.BLOCKED, previous_mlbf: 'MLBF' = None
):
_, _, changed_count = self.generate_diffs(previous_mlbf)[block_type]
return changed_count
added, removed = self.generate_diffs(block_type, previous_mlbf)
return len(added) + len(removed)

@classmethod
def load_from_storage(
Expand Down
43 changes: 43 additions & 0 deletions src/olympia/blocklist/tests/test_cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
STATSD_PREFIX = 'blocklist.cron.upload_mlbf_to_remote_settings.'


@pytest.mark.mlbf_tests
@freeze_time('2020-01-01 12:34:56')
@override_switch('blocklist_mlbf_submit', active=True)
class TestUploadToRemoteSettings(TestCase):
Expand Down Expand Up @@ -379,6 +380,48 @@ def test_invalid_cache_results_in_diff(self):
in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list
)

    # This test is very slow, so it gets its own per-test cap of 120 seconds
    # (the decorator below; the global timeout is disabled in pyproject.toml).
    @pytest.mark.timeout(120)
    @mock.patch('olympia.blocklist.mlbf.get_not_blocked_items')
    @mock.patch('olympia.blocklist.mlbf.get_all_blocked_items')
    def test_large_data_set(
        self,
        mock_get_all_blocked_items,
        mock_get_not_blocked_items,
    ):
        """
        Test that the cron can handle a large data set.

        We mock the data-fetching helpers (get_all_blocked_items and
        get_not_blocked_items) so millions of synthetic records can be fed
        in without touching the database, which would massively impact the
        performance of the test and is not what we need to test here.
        """
        # 2M blocked versions; the block type alternates so the set splits
        # evenly between hard blocks and soft blocks.
        two_million_blocked = [
            (
                f'{x}@blocked',
                f'{x}@version',
                x,
                # even numbers are blocked, odd numbers are soft blocked
                BlockType.BLOCKED if x % 2 == 0 else BlockType.SOFT_BLOCKED,
            )
            for x in range(0, 2_000_000)
        ]
        one_million_not_blocked = [
            (f'{x}@not_blocked', f'{x}@version') for x in range(2_000_000, 3_000_000)
        ]

        mock_get_all_blocked_items.return_value = two_million_blocked
        mock_get_not_blocked_items.return_value = one_million_not_blocked

        upload_mlbf_to_remote_settings()

        # The 2M blocked records split 50/50 across hard/soft blocks.
        mlbf = MLBF.load_from_storage(self.current_time)
        assert len(mlbf.data.blocked_items) == 1_000_000
        assert len(mlbf.data.soft_blocked_items) == 1_000_000
        assert len(mlbf.data.not_blocked_items) == 1_000_000

        # Run again with soft blocking enabled to exercise that code path.
        with override_switch('enable-soft-blocking', active=True):
            upload_mlbf_to_remote_settings()


class TestTimeMethods(TestCase):
@freeze_time('2024-10-10 12:34:56')
Expand Down
Loading

0 comments on commit 0406b1a

Please sign in to comment.