Skip to content

Commit

Permalink
Improve the KATalogus /plugins endpoint performance (#3892)
Browse files Browse the repository at this point in the history
Co-authored-by: Jeroen Dekkers <[email protected]>
Co-authored-by: Jan Klopper <[email protected]>
  • Loading branch information
3 people authored Dec 3, 2024
1 parent 25aa493 commit 59df219
Show file tree
Hide file tree
Showing 20 changed files with 227 additions and 122 deletions.
2 changes: 1 addition & 1 deletion boefjes/.ci/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ services:
dockerfile: boefjes/Dockerfile
args:
- ENVIRONMENT=dev
command: bash -c "python -m cProfile -o .ci/bench_$(date +%Y_%m_%d-%H:%M:%S).pstat -m pytest -v -m slow tests/integration"
command: bash -c "python -m cProfile -o .ci/bench_$(date +%Y_%m_%d-%H:%M:%S).pstat -m pytest -v -m slow tests/integration/test_bench.py::test_migration"
depends_on:
- ci_bytes
- ci_octopoes
Expand Down
11 changes: 9 additions & 2 deletions boefjes/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,21 @@ itest: ## Run the integration tests.
$(ci-docker-compose) build
$(ci-docker-compose) down --remove-orphans
$(ci-docker-compose) run --rm katalogus_integration
$(ci-docker-compose) down
$(ci-docker-compose) stop

bench: ## Run the report benchmark.
migration_bench: ## Run the migration benchmark.
$(ci-docker-compose) build
$(ci-docker-compose) down --remove-orphans
$(ci-docker-compose) run --rm migration_bench
$(ci-docker-compose) stop

bench: ## Run the other benchmarks
$(ci-docker-compose) build
$(ci-docker-compose) down --remove-orphans
$(ci-docker-compose) run --rm katalogus_integration \
python -m cProfile -o .ci/bench_$$(date +%Y_%m_%d-%H:%M:%S).pstat -m pytest -m slow --no-cov tests/integration
$(ci-docker-compose) stop

debian12:
docker run --rm \
--env PKG_NAME=kat-boefjes \
Expand Down
17 changes: 8 additions & 9 deletions boefjes/boefjes/dependencies/plugins.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import contextlib
from collections.abc import Iterator
from pathlib import Path
from typing import Literal
Expand All @@ -17,7 +16,6 @@
from boefjes.storage.interfaces import (
ConfigStorage,
DuplicatePlugin,
NotFound,
PluginNotFound,
PluginStorage,
SettingsNotConformingToSchema,
Expand Down Expand Up @@ -45,8 +43,15 @@ def __exit__(self, exc_type, exc_val, exc_tb):

def get_all(self, organisation_id: str) -> list[PluginType]:
all_plugins = self._get_all_without_enabled()
plugin_states = self.config_storage.get_state_by_id(organisation_id)

return [self._set_plugin_enabled(plugin, organisation_id) for plugin in all_plugins.values()]
for plugin in all_plugins.values():
if plugin.id not in plugin_states:
continue

plugin.enabled = plugin_states[plugin.id]

return list(all_plugins.values())

def _get_all_without_enabled(self) -> dict[str, PluginType]:
all_plugins = {plugin.id: plugin for plugin in self.plugin_storage.get_all()}
Expand Down Expand Up @@ -217,12 +222,6 @@ def _assert_settings_match_schema(self, all_settings: dict, plugin_id: str, orga
except ValidationError as e:
raise SettingsNotConformingToSchema(plugin_id, e.message) from e

def _set_plugin_enabled(self, plugin: PluginType, organisation_id: str) -> PluginType:
with contextlib.suppress(KeyError, NotFound):
plugin.enabled = self.config_storage.is_enabled_by_id(plugin.id, organisation_id)

return plugin


def get_plugin_service() -> Iterator[PluginService]:
def closure(session: Session):
Expand Down
16 changes: 1 addition & 15 deletions boefjes/boefjes/katalogus/organisations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,11 @@
from boefjes.models import Organisation
from boefjes.sql.db import ObjectNotFoundException
from boefjes.sql.organisation_storage import get_organisations_store
from boefjes.storage.interfaces import OrganisationNotFound, OrganisationStorage
from boefjes.storage.interfaces import OrganisationStorage

router = APIRouter(prefix="/organisations", tags=["organisations"])


def check_organisation_exists(
organisation_id: str, storage: OrganisationStorage = Depends(get_organisations_store)
) -> None:
"""
Checks if an organisation exists, if not, creates it.
"""
with storage as store:
try:
store.get_by_id(organisation_id)
except OrganisationNotFound:
add_organisation(Organisation(id=organisation_id, name=organisation_id), storage)
storage.get_by_id(organisation_id)


@router.get("", response_model=dict[str, Organisation])
def list_organisations(storage: OrganisationStorage = Depends(get_organisations_store)):
return storage.get_all()
Expand Down
36 changes: 16 additions & 20 deletions boefjes/boefjes/katalogus/plugins.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import datetime
from functools import partial

import structlog
from croniter import croniter
Expand All @@ -15,14 +14,11 @@
get_plugin_service,
get_plugins_filter_parameters,
)
from boefjes.katalogus.organisations import check_organisation_exists
from boefjes.models import FilterParameters, PaginationParameters, PluginType
from boefjes.sql.plugin_storage import get_plugin_storage
from boefjes.storage.interfaces import DuplicatePlugin, IntegrityError, NotAllowed, PluginStorage

router = APIRouter(
prefix="/organisations/{organisation_id}", tags=["plugins"], dependencies=[Depends(check_organisation_exists)]
)
router = APIRouter(prefix="/organisations/{organisation_id}", tags=["plugins"])

logger = structlog.get_logger(__name__)

Expand All @@ -44,33 +40,34 @@ def list_plugins(
pagination_params: PaginationParameters = Depends(get_pagination_parameters),
plugin_service: PluginService = Depends(get_plugin_service),
) -> list[PluginType]:
with plugin_service as p:
if filter_params.ids:
try:
plugins = p.by_plugin_ids(filter_params.ids, organisation_id)
except KeyError:
raise HTTPException(status.HTTP_404_NOT_FOUND, "Plugin not found")
else:
plugins = p.get_all(organisation_id)
if filter_params.ids:
try:
plugins = plugin_service.by_plugin_ids(filter_params.ids, organisation_id)
except KeyError:
raise HTTPException(status.HTTP_404_NOT_FOUND, "Plugin not found")
else:
plugins = plugin_service.get_all(organisation_id)

# filter plugins by id, name or description
if filter_params.q is not None:
plugins = filter(partial(_plugin_matches_query, query=filter_params.q), plugins)
plugins = [plugin for plugin in plugins if _plugin_matches_query(plugin, filter_params.q)]

# filter plugins by type
if filter_params.type is not None:
plugins = filter(lambda plugin: plugin.type == filter_params.type, plugins)
plugins = [plugin for plugin in plugins if plugin.type == filter_params.type]

# filter plugins by state
if filter_params.state is not None:
plugins = filter(lambda x: x.enabled is filter_params.state, plugins)
plugins = [plugin for plugin in plugins if plugin.enabled is filter_params.state]

# filter plugins by oci_image
if filter_params.oci_image is not None:
plugins = filter(lambda x: x.type == "boefje" and x.oci_image == filter_params.oci_image, plugins)
plugins = [
plugin for plugin in plugins if plugin.type == "boefje" and plugin.oci_image == filter_params.oci_image
]

# filter plugins by scan level for boefje plugins
plugins = list(filter(lambda x: x.type != "boefje" or x.scan_level >= filter_params.scan_level, plugins))
plugins = [plugin for plugin in plugins if plugin.type != "boefje" or plugin.scan_level >= filter_params.scan_level]

if pagination_params.limit is None:
return plugins[pagination_params.offset :]
Expand All @@ -84,8 +81,7 @@ def get_plugin(
plugin_id: str, organisation_id: str, plugin_service: PluginService = Depends(get_plugin_service)
) -> PluginType:
try:
with plugin_service as p:
return p.by_plugin_id(plugin_id, organisation_id)
return plugin_service.by_plugin_id(plugin_id, organisation_id)
except KeyError:
raise HTTPException(status.HTTP_404_NOT_FOUND, "Plugin not found")

Expand Down
10 changes: 2 additions & 8 deletions boefjes/boefjes/katalogus/settings.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,13 @@
from fastapi import APIRouter, Depends

from boefjes.dependencies.plugins import PluginService, get_plugin_service
from boefjes.katalogus.organisations import check_organisation_exists

router = APIRouter(
prefix="/organisations/{organisation_id}/{plugin_id}/settings",
tags=["settings"],
dependencies=[Depends(check_organisation_exists)],
)
router = APIRouter(prefix="/organisations/{organisation_id}/{plugin_id}/settings", tags=["settings"])


@router.get("", response_model=dict)
def list_settings(organisation_id: str, plugin_id: str, plugin_service: PluginService = Depends(get_plugin_service)):
with plugin_service as p:
return p.get_all_settings(organisation_id, plugin_id)
return plugin_service.get_all_settings(organisation_id, plugin_id)


@router.put("")
Expand Down
117 changes: 65 additions & 52 deletions boefjes/boefjes/local_repository.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import pkgutil
from functools import cache, lru_cache
from pathlib import Path
from typing import Any

import structlog

Expand All @@ -15,6 +15,7 @@
BoefjeResource,
ModuleException,
NormalizerResource,
hash_path,
)

logger = structlog.get_logger(__name__)
Expand All @@ -23,16 +24,11 @@
class LocalPluginRepository:
def __init__(self, path: Path):
self.path = path
self._cached_boefjes: dict[str, Any] | None = None
self._cached_normalizers: dict[str, Any] | None = None

def get_all(self) -> list[PluginType]:
all_plugins = [boefje_resource.boefje for boefje_resource in self.resolve_boefjes().values()]
normalizers = [normalizer_resource.normalizer for normalizer_resource in self.resolve_normalizers().values()]

all_plugins += normalizers

return all_plugins
boefjes = [resource.boefje for resource in self.resolve_boefjes().values()]
normalizers = [resource.normalizer for resource in self.resolve_normalizers().values()]
return boefjes + normalizers

def by_id(self, plugin_id: str) -> PluginType:
boefjes = self.resolve_boefjes()
Expand Down Expand Up @@ -107,66 +103,83 @@ def description_path(self, id_: str) -> Path | None:
return boefjes[id_].path / "description.md"

def resolve_boefjes(self) -> dict[str, BoefjeResource]:
if self._cached_boefjes:
return self._cached_boefjes
return _cached_resolve_boefjes(self.path)

def resolve_normalizers(self) -> dict[str, NormalizerResource]:
return _cached_resolve_normalizers(self.path)

paths_and_packages = self._find_packages_in_path_containing_files([BOEFJE_DEFINITION_FILE])
boefje_resources = []

for path, package in paths_and_packages:
try:
boefje_resources.append(BoefjeResource(path, package))
except ModuleException as exc:
logger.exception(exc)
@cache
def _cached_resolve_boefjes(path: Path) -> dict[str, BoefjeResource]:
paths_and_packages = _find_packages_in_path_containing_files(path, (BOEFJE_DEFINITION_FILE,))
boefje_resources = []

self._cached_boefjes = {resource.boefje.id: resource for resource in boefje_resources}
for path, package in paths_and_packages:
try:
boefje_resources.append(get_boefje_resource(path, package, hash_path(path)))
except ModuleException as exc:
logger.exception(exc)

return self._cached_boefjes
return {resource.boefje.id: resource for resource in boefje_resources}

def resolve_normalizers(self) -> dict[str, NormalizerResource]:
if self._cached_normalizers:
return self._cached_normalizers

paths_and_packages = self._find_packages_in_path_containing_files(
[NORMALIZER_DEFINITION_FILE, ENTRYPOINT_NORMALIZERS]
)
normalizer_resources = []
@cache
def _cached_resolve_normalizers(path: Path) -> dict[str, NormalizerResource]:
paths_and_packages = _find_packages_in_path_containing_files(
path, (NORMALIZER_DEFINITION_FILE, ENTRYPOINT_NORMALIZERS)
)
normalizer_resources = []

for path, package in paths_and_packages:
try:
normalizer_resources.append(get_normalizer_resource(path, package, hash(path)))
except ModuleException as exc:
logger.exception(exc)

return {resource.normalizer.id: resource for resource in normalizer_resources}


def _find_packages_in_path_containing_files(path: Path, required_files: tuple[str, ...]) -> list[tuple[Path, str]]:
prefix = create_relative_import_statement_from_cwd(path)
paths = []

for package in pkgutil.walk_packages([str(path)], prefix):
if not package.ispkg:
logger.debug("%s is not a package", package.name)
continue

new_path = path / package.name.replace(prefix, "").replace(".", "/")
missing_files = [file for file in required_files if not (new_path / file).exists()]

if missing_files:
logger.debug("Files %s not found for %s", missing_files, package.name)
continue

for path, package in paths_and_packages:
try:
normalizer_resources.append(NormalizerResource(path, package))
except ModuleException as exc:
logger.exception(exc)
paths.append((new_path, package.name))

self._cached_normalizers = {resource.normalizer.id: resource for resource in normalizer_resources}
return paths

return self._cached_normalizers

def _find_packages_in_path_containing_files(self, required_files: list[str]) -> list[tuple[Path, str]]:
prefix = self.create_relative_import_statement_from_cwd(self.path)
paths = []
def create_relative_import_statement_from_cwd(package_dir: Path) -> str:
relative_path = str(package_dir.absolute()).replace(str(Path.cwd()), "") # e.g. "/boefjes/plugins"

for package in pkgutil.walk_packages([str(self.path)], prefix):
if not package.ispkg:
logger.debug("%s is not a package", package.name)
continue
return f"{relative_path[1:].replace('/', '.')}." # Turns into "boefjes.plugins."

path = self.path / package.name.replace(prefix, "").replace(".", "/")
missing_files = [file for file in required_files if not (path / file).exists()]

if missing_files:
logger.debug("Files %s not found for %s", missing_files, package.name)
continue
@lru_cache(maxsize=200)
def get_boefje_resource(path: Path, package: str, path_hash: str):
"""The cache size in theory only has to be the amount of local boefjes available, but 200 gives us some extra
space. Adding the hash to the arguments makes sure we refresh this."""

paths.append((path, package.name))
return BoefjeResource(path, package, path_hash)

return paths

@staticmethod
def create_relative_import_statement_from_cwd(package_dir: Path) -> str:
relative_path = str(package_dir.absolute()).replace(str(Path.cwd()), "") # e.g. "/boefjes/plugins"
@lru_cache(maxsize=200)
def get_normalizer_resource(path: Path, package: str, path_hash: str):
"""The cache size in theory only has to be the amount of local normalizers available, but 200 gives us some extra
space. Adding the hash to the arguments makes sure we refresh this."""

return f"{relative_path[1:].replace('/', '.')}." # Turns into "boefjes.plugins."
return NormalizerResource(path, package)


def get_local_repository():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def upgrade() -> None:
plugins = local_repo.get_all()
logger.info("Found %s plugins", len(plugins))

for plugin in local_repo.get_all():
for plugin in plugins:
schema = local_repo.schema(plugin.id)
if schema:
query = text("UPDATE boefje SET schema = :schema WHERE plugin_id = :plugin_id") # noqa: S608
Expand Down
6 changes: 3 additions & 3 deletions boefjes/boefjes/plugins/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ def get_runnable_module_from_package(package: str, module_file: str, *, paramete
class BoefjeResource:
"""Represents a Boefje package that we can run. Throws a ModuleException if any validation fails."""

def __init__(self, path: Path, package: str):
def __init__(self, path: Path, package: str, path_hash: str):
self.path = path
self.boefje: Boefje = Boefje.model_validate_json(path.joinpath(BOEFJE_DEFINITION_FILE).read_text())
self.boefje.runnable_hash = get_runnable_hash(self.path)
self.boefje.runnable_hash = path_hash
self.boefje.produces = self.boefje.produces.union(set(_default_mime_types(self.boefje)))
self.module: Runnable | None = None

Expand All @@ -86,7 +86,7 @@ def __init__(self, path: Path, package: str):
self.module = get_runnable_module_from_package(package, ENTRYPOINT_NORMALIZERS, parameter_count=2)


def get_runnable_hash(path: Path) -> str:
def hash_path(path: Path) -> str:
"""Returns sha256(file1 + file2 + ...) of all files in the given path."""

folder_hash = hashlib.sha256()
Expand Down
Loading

0 comments on commit 59df219

Please sign in to comment.