From 094090269adae2b710513ce1ba25126aaace2f4c Mon Sep 17 00:00:00 2001 From: Vladimir Filonov Date: Wed, 25 Dec 2024 15:40:03 +0400 Subject: [PATCH 01/21] feat: multiple alerts matching rules PoC (#2727) --- .../CorrelationSidebar/CorrelationForm.tsx | 30 ++- .../CorrelationSidebarBody.tsx | 2 + .../(keep)/rules/CorrelationSidebar/index.tsx | 1 + .../(keep)/rules/CorrelationSidebar/types.ts | 1 + keep-ui/app/(keep)/rules/CorrelationTable.tsx | 1 + keep-ui/utils/hooks/useRules.ts | 1 + keep/api/bl/incidents_bl.py | 30 +-- keep/api/core/db.py | 100 +++++---- keep/api/core/demo_mode.py | 6 + keep/api/models/db/alert.py | 4 + .../versions/2024-12-10-19-11_7297ae99cd21.py | 31 +++ keep/api/models/db/rule.py | 7 + keep/api/routes/rules.py | 13 +- keep/rulesengine/rulesengine.py | 189 ++++++++++++++---- scripts/simulate_alerts.py | 10 +- tests/test_rules_api.py | 1 + tests/test_rules_engine.py | 163 ++++++++++++++- 17 files changed, 479 insertions(+), 111 deletions(-) create mode 100644 keep/api/models/db/migrations/versions/2024-12-10-19-11_7297ae99cd21.py diff --git a/keep-ui/app/(keep)/rules/CorrelationSidebar/CorrelationForm.tsx b/keep-ui/app/(keep)/rules/CorrelationSidebar/CorrelationForm.tsx index 6f56dc6df..5490c7e71 100644 --- a/keep-ui/app/(keep)/rules/CorrelationSidebar/CorrelationForm.tsx +++ b/keep-ui/app/(keep)/rules/CorrelationSidebar/CorrelationForm.tsx @@ -87,7 +87,7 @@ export const CorrelationForm = ({ /> -
+
-
+
( + render={({field: {value, onChange}}) => ( { groupedAttributes: selectedRule.grouping_criteria, requireApprove: selectedRule.require_approve, resolveOn: selectedRule.resolve_on, + createOn: selectedRule.create_on, query: queryInGroup, incidents: selectedRule.incidents, }; diff --git a/keep-ui/utils/hooks/useRules.ts b/keep-ui/utils/hooks/useRules.ts index 957036614..e78f3ed54 100644 --- a/keep-ui/utils/hooks/useRules.ts +++ b/keep-ui/utils/hooks/useRules.ts @@ -18,6 +18,7 @@ export type Rule = { update_time: string | null; require_approve: boolean; resolve_on: "all" | "first" | "last" | "never"; + create_on: "any" | "all"; distribution: { [group: string]: { [timestamp: string]: number } }; incidents: number; }; diff --git a/keep/api/bl/incidents_bl.py b/keep/api/bl/incidents_bl.py index a1b91a8fe..7feb07bb6 100644 --- a/keep/api/bl/incidents_bl.py +++ b/keep/api/bl/incidents_bl.py @@ -81,12 +81,12 @@ def create_incident( "Incident DTO created", extra={"incident_id": new_incident_dto.id, "tenant_id": self.tenant_id}, ) - self.__update_client_on_incident_change() + self.update_client_on_incident_change() self.logger.info( "Client updated on incident change", extra={"incident_id": new_incident_dto.id, "tenant_id": self.tenant_id}, ) - self.__run_workflows(new_incident_dto, "created") + self.send_workflow_event(new_incident_dto, "created") self.logger.info( "Workflows run on incident", extra={"incident_id": new_incident_dto.id, "tenant_id": self.tenant_id}, @@ -134,7 +134,7 @@ def __update_elastic(self, alert_fingerprints: List[str]): self.logger.exception("Failed to push alert to elasticsearch") raise - def __update_client_on_incident_change(self, incident_id: Optional[UUID] = None): + def update_client_on_incident_change(self, incident_id: Optional[UUID] = None): if self.pusher_client is not None: self.logger.info( "Pushing incident change to client", @@ -150,7 +150,7 @@ def __update_client_on_incident_change(self, incident_id: Optional[UUID] = None) extra={"incident_id": incident_id, "tenant_id": self.tenant_id}, ) - def __run_workflows(self, incident_dto: IncidentDto, action: str): + def send_workflow_event(self, incident_dto: IncidentDto, action: str) -> None: try: workflow_manager = WorkflowManager.get_instance() workflow_manager.insert_incident(self.tenant_id, incident_dto, action) @@ -227,17 +227,9 @@ def delete_incident(self, incident_id: UUID) -> None: ) if not deleted: raise HTTPException(status_code=404, detail="Incident not found") - self.__update_client_on_incident_change() - try: - workflow_manager = WorkflowManager.get_instance() - self.logger.info("Adding incident to the workflow manager queue") - workflow_manager.insert_incident(self.tenant_id, incident_dto, "deleted") - self.logger.info("Added incident to the workflow manager queue") - except Exception: - self.logger.exception( - "Failed to run workflows based on incident", - extra={"incident_id": incident_dto.id, "tenant_id": self.tenant_id}, - ) + + self.update_client_on_incident_change() + self.send_workflow_event(incident_dto, "deleted") def update_incident( self, @@ -260,12 +252,12 @@ def update_incident( new_incident_dto = IncidentDto.from_db_incident(incident) - self.__update_client_on_incident_change(incident.id) + self.update_client_on_incident_change(incident.id) self.logger.info( "Client updated on incident change", extra={"incident_id": incident.id}, ) - self.__run_workflows(new_incident_dto, "updated") + self.send_workflow_event(new_incident_dto, "updated") self.logger.info( "Workflows run on incident", 
extra={"incident_id": incident.id}, @@ -279,13 +271,13 @@ def __postprocess_alerts_change(self, incident, alert_fingerprints): "Alerts pushed to elastic", extra={"incident_id": incident.id, "alert_fingerprints": alert_fingerprints}, ) - self.__update_client_on_incident_change(incident.id) + self.update_client_on_incident_change(incident.id) self.logger.info( "Client updated on incident change", extra={"incident_id": incident.id, "alert_fingerprints": alert_fingerprints}, ) incident_dto = IncidentDto.from_db_incident(incident) - self.__run_workflows(incident_dto, "updated") + self.send_workflow_event(incident_dto, "updated") self.logger.info( "Workflows run on incident", extra={"incident_id": incident.id, "alert_fingerprints": alert_fingerprints}, diff --git a/keep/api/core/db.py b/keep/api/core/db.py index 93a0f08cc..158972d6a 100644 --- a/keep/api/core/db.py +++ b/keep/api/core/db.py @@ -1661,6 +1661,7 @@ def create_rule( group_description=None, require_approve=False, resolve_on=ResolveOn.NEVER.value, + create_on=CreateIncidentOn.ANY.value, ): grouping_criteria = grouping_criteria or [] with Session(engine) as session: @@ -1677,6 +1678,7 @@ def create_rule( group_description=group_description, require_approve=require_approve, resolve_on=resolve_on, + create_on=create_on, ) session.add(rule) session.commit() @@ -1696,6 +1698,7 @@ def update_rule( grouping_criteria, require_approve, resolve_on, + create_on, ): rule_uuid = __convert_to_uuid(rule_id) if not rule_uuid: @@ -1717,6 +1720,7 @@ def update_rule( rule.updated_by = updated_by rule.update_time = datetime.utcnow() rule.resolve_on = resolve_on + rule.create_on = create_on session.commit() session.refresh(rule) return rule @@ -1771,8 +1775,8 @@ def delete_rule(tenant_id, rule_id): def get_incident_for_grouping_rule( - tenant_id, rule, timeframe, rule_fingerprint, session: Optional[Session] = None -) -> Incident: + tenant_id, rule, rule_fingerprint, session: Optional[Session] = None +) -> Optional[Incident]: # checks if incident with the incident criteria exists, if not it creates it # and then assign the alert to the incident with existed_or_new_session(session) as session: @@ -1791,26 +1795,34 @@ def get_incident_for_grouping_rule( enrich_incidents_with_alerts(tenant_id, [incident], session) is_incident_expired = max( alert.timestamp for alert in incident._alerts - ) < datetime.utcnow() - timedelta(seconds=timeframe) + ) < datetime.utcnow() - timedelta(seconds=rule.timeframe) # if there is no incident with the rule_fingerprint, create it or existed is already expired if not incident or is_incident_expired: - # Create and add a new incident if it doesn't exist - incident = Incident( - tenant_id=tenant_id, - user_generated_name=f"{rule.name}", - rule_id=rule.id, - rule_fingerprint=rule_fingerprint, - is_predicted=False, - is_confirmed=not rule.require_approve, - ) - session.add(incident) - session.commit() - session.refresh(incident) + return None return incident +def create_incident_for_grouping_rule( + tenant_id, rule, rule_fingerprint, session: Optional[Session] = None +): + + with existed_or_new_session(session) as session: + # Create and add a new incident if it doesn't exist + incident = Incident( + tenant_id=tenant_id, + user_generated_name=f"{rule.name}", + rule_id=rule.id, + rule_fingerprint=rule_fingerprint, + is_predicted=False, + is_confirmed=rule.create_on == CreateIncidentOn.ANY.value and not rule.require_approve, + ) + session.add(incident) + session.commit() + session.refresh(incident) + return incident + def 
get_rule(tenant_id, rule_id): with Session(engine) as session: rule = session.exec( @@ -3420,7 +3432,7 @@ def create_incident_from_dict( def update_incident_from_dto_by_id( tenant_id: str, - incident_id: str, + incident_id: str | UUID, updated_incident_dto: IncidentDtoIn | IncidentDto, generated_by_ai: bool = False, ) -> Optional[Incident]: @@ -3803,7 +3815,7 @@ def add_alerts_to_incident( return incident -def get_incident_unique_fingerprint_count(tenant_id: str, incident_id: str) -> int: +def get_incident_unique_fingerprint_count(tenant_id: str, incident_id: str | UUID) -> int: with Session(engine) as session: return session.execute( select(func.count(1)) @@ -4476,12 +4488,22 @@ def get_workflow_executions_for_incident_or_alert( results = session.execute(final_query).all() return results, total_count +def is_all_alerts_resolved( + fingerprints: Optional[List[str]] = None, + incident: Optional[Incident] = None, + session: Optional[Session] = None +): + return is_all_alerts_in_status(fingerprints, incident, AlertStatus.RESOLVED, session) -def is_all_incident_alerts_resolved( - incident: Incident, session: Optional[Session] = None -) -> bool: - if incident.alerts_count == 0: +def is_all_alerts_in_status( + fingerprints: Optional[List[str]] = None, + incident: Optional[Incident] = None, + status: AlertStatus = AlertStatus.RESOLVED, + session: Optional[Session] = None +): + + if incident and incident.alerts_count == 0: return False with existed_or_new_session(session) as session: @@ -4505,20 +4527,30 @@ def is_all_incident_alerts_resolved( Alert.fingerprint == AlertEnrichment.alert_fingerprint, ), ) - .join( + ) + + if fingerprints: + subquery = subquery.where(LastAlert.fingerprint.in_(fingerprints)) + + if incident: + subquery = ( + subquery + .join( LastAlertToIncident, and_( LastAlertToIncident.tenant_id == LastAlert.tenant_id, LastAlertToIncident.fingerprint == LastAlert.fingerprint, ), ) - .where( - LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, - LastAlertToIncident.incident_id == incident.id, + .where( + LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT, + LastAlertToIncident.incident_id == incident.id, + ) ) - ).subquery() - not_resolved_exists = session.query( + subquery = subquery.subquery() + + not_in_status_exists = session.query( exists( select( subquery.c.enriched_status, @@ -4527,17 +4559,17 @@ def is_all_incident_alerts_resolved( .select_from(subquery) .where( or_( - subquery.c.enriched_status != AlertStatus.RESOLVED.value, + subquery.c.enriched_status != status.value, and_( subquery.c.enriched_status.is_(None), - subquery.c.status != AlertStatus.RESOLVED.value, + subquery.c.status != status.value, ), ) ) ) ).scalar() - return not not_resolved_exists + return not not_in_status_exists def is_last_incident_alert_resolved( @@ -4888,11 +4920,11 @@ def set_last_alert( timestamp=alert.timestamp, first_timestamp=alert.timestamp, alert_id=alert.id, - alert_hash=alert.alert_hash, - ) + alert_hash=alert.alert_hash, + ) - session.add(last_alert) - session.commit() + session.add(last_alert) + session.commit() except OperationalError as ex: if "no such savepoint" in ex.args[0]: logger.info( diff --git a/keep/api/core/demo_mode.py b/keep/api/core/demo_mode.py index 17f4b8b28..e18ea90db 100644 --- a/keep/api/core/demo_mode.py +++ b/keep/api/core/demo_mode.py @@ -432,6 +432,7 @@ async def simulate_alerts_async( demo_topology=False, clean_old_incidents=False, demo_ai=False, + count=None, target_rps=0, ): logger.info("Simulating alerts...") @@ -487,6 +488,11 @@ async def 
simulate_alerts_async( shoot = 1 while True: + if count is not None: + count -= 1 + if count < 0: + break + try: logger.info("Looping to send alerts...") diff --git a/keep/api/models/db/alert.py b/keep/api/models/db/alert.py index 31d83fe47..1d9bbeb52 100644 --- a/keep/api/models/db/alert.py +++ b/keep/api/models/db/alert.py @@ -208,6 +208,10 @@ class Incident(SQLModel, table=True): class Config: arbitrary_types_allowed = True + @property + def alerts(self): + return self._alerts + class Alert(SQLModel, table=True): id: UUID = Field(default_factory=uuid4, primary_key=True) diff --git a/keep/api/models/db/migrations/versions/2024-12-10-19-11_7297ae99cd21.py b/keep/api/models/db/migrations/versions/2024-12-10-19-11_7297ae99cd21.py new file mode 100644 index 000000000..577acbe47 --- /dev/null +++ b/keep/api/models/db/migrations/versions/2024-12-10-19-11_7297ae99cd21.py @@ -0,0 +1,31 @@ +"""Add Rule.create_on + +Revision ID: 7297ae99cd21 +Revises: 4f8c4b185d5b +Create Date: 2024-12-10 19:11:28.512095 + +""" + +import sqlalchemy as sa +import sqlmodel +from alembic import op + +# revision identifiers, used by Alembic. +revision = "7297ae99cd21" +down_revision = "4f8c4b185d5b" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.batch_alter_table("rule", schema=None) as batch_op: + batch_op.add_column( + sa.Column("create_on", sqlmodel.sql.sqltypes.AutoString(), nullable=False, default="any", server_default="any") + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + with op.batch_alter_table("rule", schema=None) as batch_op: + batch_op.drop_column("create_on") diff --git a/keep/api/models/db/rule.py b/keep/api/models/db/rule.py index 6ba47ea62..78eb886da 100644 --- a/keep/api/models/db/rule.py +++ b/keep/api/models/db/rule.py @@ -14,6 +14,12 @@ class ResolveOn(Enum): ALL = "all_resolved" NEVER = "never" + +class CreateIncidentOn(Enum): + # the alert was triggered + ANY = "any" + ALL = "all" + # TODOs/Pitfalls down the road which we hopefully need to address in the future: # 1. nested attibtues (event.foo.bar = 1) # 2. 
scale - when event arrives, we need to check if the rule is applicable to the event @@ -41,3 +47,4 @@ class Rule(SQLModel, table=True): item_description: str = None require_approve: bool = False resolve_on: str = ResolveOn.NEVER.value + create_on: str = CreateIncidentOn.ANY.value diff --git a/keep/api/routes/rules.py b/keep/api/routes/rules.py index ccb8f4fa2..ee09992e8 100644 --- a/keep/api/routes/rules.py +++ b/keep/api/routes/rules.py @@ -8,7 +8,7 @@ from keep.api.core.db import get_rule_distribution as get_rule_distribution_db from keep.api.core.db import get_rules as get_rules_db from keep.api.core.db import update_rule as update_rule_db -from keep.api.models.db.rule import ResolveOn +from keep.api.models.db.rule import ResolveOn, CreateIncidentOn from keep.identitymanager.authenticatedentity import AuthenticatedEntity from keep.identitymanager.identitymanagerfactory import IdentityManagerFactory @@ -27,6 +27,7 @@ class RuleCreateDto(BaseModel): groupDescription: str = None requireApprove: bool = False resolveOn: str = ResolveOn.NEVER.value + createOn: str = CreateIncidentOn.ANY.value @router.get( @@ -75,6 +76,7 @@ async def create_rule( group_description = rule_create_request.groupDescription require_approve = rule_create_request.requireApprove resolve_on = rule_create_request.resolveOn + create_on = rule_create_request.createOn sql = rule_create_request.sqlQuery.get("sql") params = rule_create_request.sqlQuery.get("params") @@ -99,6 +101,9 @@ async def create_rule( if not resolve_on: raise HTTPException(status_code=400, detail="resolveOn is required") + if not create_on: + raise HTTPException(status_code=400, detail="createOn is required") + rule = create_rule_db( tenant_id=tenant_id, name=rule_name, @@ -114,6 +119,7 @@ async def create_rule( group_description=group_description, require_approve=require_approve, resolve_on=resolve_on, + create_on=create_on, ) logger.info("Rule created") return rule @@ -162,6 +168,7 @@ async def update_rule( timeframe = body["timeframeInSeconds"] timeunit = body["timeUnit"] resolve_on = body["resolveOn"] + create_on = body["createOn"] grouping_criteria = body.get("groupingCriteria", []) require_approve = body.get("requireApprove", []) except Exception: @@ -191,6 +198,9 @@ async def update_rule( if not resolve_on: raise HTTPException(status_code=400, detail="resolveOn is required") + if not create_on: + raise HTTPException(status_code=400, detail="createOn is required") + rule = update_rule_db( tenant_id=tenant_id, rule_id=rule_id, @@ -206,6 +216,7 @@ async def update_rule( grouping_criteria=grouping_criteria, require_approve=require_approve, resolve_on=resolve_on, + create_on=create_on, ) if rule: diff --git a/keep/rulesengine/rulesengine.py b/keep/rulesengine/rulesengine.py index 03538b8e3..31f77b09e 100644 --- a/keep/rulesengine/rulesengine.py +++ b/keep/rulesengine/rulesengine.py @@ -1,6 +1,6 @@ import json import logging -from typing import Optional +from typing import Optional, List import celpy import celpy.c7nlib @@ -9,15 +9,21 @@ import celpy.evaluation from sqlmodel import Session -from keep.api.core.db import assign_alert_to_incident, get_incident_for_grouping_rule -from keep.api.core.db import get_rules as get_rules_db +from keep.api.bl.incidents_bl import IncidentBl from keep.api.core.db import ( - is_all_incident_alerts_resolved, + assign_alert_to_incident, + get_incident_for_grouping_rule, + create_incident_for_grouping_rule, + is_all_alerts_resolved, is_first_incident_alert_resolved, is_last_incident_alert_resolved, + 
is_all_alerts_in_status, enrich_incidents_with_alerts, ) -from keep.api.models.alert import AlertDto, AlertSeverity, IncidentDto, IncidentStatus -from keep.api.models.db.rule import ResolveOn +from keep.api.core.db import get_rules as get_rules_db +from keep.api.core.dependencies import get_pusher_client +from keep.api.models.alert import AlertDto, AlertSeverity, IncidentDto, IncidentStatus, AlertStatus +from keep.api.models.db.alert import Incident +from keep.api.models.db.rule import ResolveOn, Rule from keep.api.utils.cel_utils import preprocess_cel_expression # Shahar: this is performance enhancment https://github.com/cloud-custodian/cel-python/issues/68 @@ -59,27 +65,27 @@ def run_rules( f"Checking if rule {rule.name} apply to event {event.id}" ) try: - rule_result = self._check_if_rule_apply(rule, event) + matched_rules = self._check_if_rule_apply(rule, event) except Exception: self.logger.exception( f"Failed to evaluate rule {rule.name} on event {event.id}" ) continue - if rule_result: + + if matched_rules: self.logger.info( f"Rule {rule.name} on event {event.id} is relevant" ) + send_created_event = False + rule_fingerprint = self._calc_rule_fingerprint(event, rule) - incident = get_incident_for_grouping_rule( - self.tenant_id, + incident = self._get_or_create_incident( rule, - rule.timeframe, rule_fingerprint, - session=session, + session, ) - incident = assign_alert_to_incident( fingerprint=event.fingerprint, incident=incident, @@ -87,36 +93,41 @@ def run_rules( session=session, ) - should_resolve = False + if not incident.is_confirmed: + + self.logger.info( + f"No existing incidents for rule {rule.name}. Checking incident creation conditions" + ) + + rule_groups = self._extract_subrules(rule.definition_cel) + + if rule.create_on == "any" or (rule.create_on == "all" and len(rule_groups) == len(matched_rules)): + self.logger.info("Single event is enough, so creating incident") + incident.is_confirmed = True + elif rule.create_on == "all": + incident = self._process_event_for_history_based_rule( + incident, rule, session + ) - if ( - rule.resolve_on == ResolveOn.ALL.value - and is_all_incident_alerts_resolved(incident, session=session) - ): - should_resolve = True + send_created_event = incident.is_confirmed - elif ( - rule.resolve_on == ResolveOn.FIRST.value - and is_first_incident_alert_resolved(incident, session=session) - ): - should_resolve = True + incident = self._resolve_incident_if_require(rule, incident, session) + session.add(incident) + session.commit() - elif ( - rule.resolve_on == ResolveOn.LAST.value - and is_last_incident_alert_resolved(incident, session=session) - ): - should_resolve = True + incident_dto = IncidentDto.from_db_incident(incident) + if send_created_event: + self._send_workflow_event(session, incident_dto, "created") + elif incident.is_confirmed: + self._send_workflow_event(session, incident_dto, "updated") - if should_resolve: - incident.status = IncidentStatus.RESOLVED.value - session.add(incident) - session.commit() + incidents_dto[incident.id] = incident_dto - incidents_dto[incident.id] = IncidentDto.from_db_incident(incident) else: self.logger.info( f"Rule {rule.name} on event {event.id} is not relevant" ) + self.logger.info("Rules ran successfully") # if we don't have any updated groups, we don't need to create any alerts if not incidents_dto: @@ -126,10 +137,94 @@ def run_rules( return list(incidents_dto.values()) - def _extract_subrules(self, expression): - # CEL rules looks like '(source == "sentry") && (source == "grafana" && severity == 
"critical")' + + def _get_or_create_incident(self, rule, rule_fingerprint, session): + incident = get_incident_for_grouping_rule( + self.tenant_id, + rule, + rule_fingerprint, + session=session, + ) + if not incident: + incident = create_incident_for_grouping_rule( + self.tenant_id, + rule, + rule_fingerprint, + session=session, + ) + return incident + + def _process_event_for_history_based_rule( + self, + incident: Incident, + rule: Rule, + session: Session + ) -> Incident: + self.logger.info( + "Multiple events required for the incident to start" + ) + + enrich_incidents_with_alerts( + tenant_id=self.tenant_id, + incidents=[incident], + session=session, + ) + + fingerprints = [alert.fingerprint for alert in incident.alerts] + + is_all_conditions_met = False + + all_sub_rules = set(self._extract_subrules(rule.definition_cel)) + matched_sub_rules = set() + + for alert in incident.alerts: + matched_sub_rules = matched_sub_rules.union(self._check_if_rule_apply(rule, AlertDto(**alert.event))) + if all_sub_rules == matched_sub_rules: + is_all_conditions_met = True + break + + if is_all_conditions_met: + all_alerts_firing = is_all_alerts_in_status( + fingerprints=fingerprints, status=AlertStatus.FIRING, session=session + ) + if all_alerts_firing: + incident.is_confirmed = True + + return incident + + @staticmethod + def _resolve_incident_if_require(rule: Rule, incident: Incident, session: Session) -> Incident: + + should_resolve = False + + if ( + rule.resolve_on == ResolveOn.ALL.value + and is_all_alerts_resolved(incident=incident, session=session) + ): + should_resolve = True + + elif ( + rule.resolve_on == ResolveOn.FIRST.value + and is_first_incident_alert_resolved(incident, session=session) + ): + should_resolve = True + + elif ( + rule.resolve_on == ResolveOn.LAST.value + and is_last_incident_alert_resolved(incident, session=session) + ): + should_resolve = True + + if should_resolve: + incident.status = IncidentStatus.RESOLVED.value + + return incident + + @staticmethod + def _extract_subrules(expression): + # CEL rules looks like '(source == "sentry") || (source == "grafana" && severity == "critical")' # and we need to extract the subrules - sub_rules = expression.split(") && (") + sub_rules = expression.split(") || (") if len(sub_rules) == 1: return sub_rules # the first and the last rules will have a ( or ) at the beginning or the end @@ -142,7 +237,7 @@ def _extract_subrules(self, expression): return sub_rules # TODO: a lot of unit tests to write here - def _check_if_rule_apply(self, rule, event: AlertDto): + def _check_if_rule_apply(self, rule: Rule, event: AlertDto) -> List[str]: sub_rules = self._extract_subrules(rule.definition_cel) payload = event.dict() # workaround since source is a list @@ -152,6 +247,7 @@ def _check_if_rule_apply(self, rule, event: AlertDto): # what we do here is to compile the CEL rule and evaluate it # https://github.com/cloud-custodian/cel-python # https://github.com/google/cel-spec + sub_rules_matched = [] for sub_rule in sub_rules: ast = self.env.compile(sub_rule) prgm = self.env.program(ast) @@ -161,13 +257,13 @@ def _check_if_rule_apply(self, rule, event: AlertDto): except celpy.evaluation.CELEvalError as e: # this is ok, it means that the subrule is not relevant for this event if "no such member" in str(e): - return False + continue # unknown raise if r: - return True + sub_rules_matched.append(sub_rule) # no subrules matched - return False + return sub_rules_matched def _calc_rule_fingerprint(self, event: AlertDto, rule): # extract all the grouping 
criteria from the event @@ -213,7 +309,8 @@ def _calc_rule_fingerprint(self, event: AlertDto, rule): return "none" return ",".join(rule_fingerprint) - def get_alerts_activation(self, alerts: list[AlertDto]): + @staticmethod + def get_alerts_activation(alerts: list[AlertDto]): activations = [] for alert in alerts: payload = alert.dict() @@ -288,3 +385,11 @@ def filter_alerts( filtered_alerts.append(alert) return filtered_alerts + + def _send_workflow_event(self, session: Session, incident_dto: IncidentDto, action: str): + pusher_client = get_pusher_client() + incident_bl = IncidentBl(self.tenant_id, session, pusher_client) + + incident_bl.send_workflow_event(incident_dto, action) + incident_bl.update_client_on_incident_change(incident_dto.id) + diff --git a/scripts/simulate_alerts.py b/scripts/simulate_alerts.py index 45db1310b..4eba9086d 100644 --- a/scripts/simulate_alerts.py +++ b/scripts/simulate_alerts.py @@ -17,6 +17,13 @@ async def main(): parser = argparse.ArgumentParser(description="Simulate alerts for Keep API.") + parser.add_argument( + "--num", + action="store", + dest="num", + type=int, + help="Number of alerts to simulate." + ) parser.add_argument( "--full-demo", action="store_true", @@ -50,7 +57,8 @@ async def main(): demo_topology=args.full_demo, clean_old_incidents=args.full_demo, demo_ai=args.full_demo, - target_rps=rps + count=args.num, + target_rps=rps, ) diff --git a/tests/test_rules_api.py b/tests/test_rules_api.py index e5ffaaff3..380ec3c43 100644 --- a/tests/test_rules_api.py +++ b/tests/test_rules_api.py @@ -163,6 +163,7 @@ def test_update_rule_api(db_session, client, test_app): "timeUnit": "seconds", "requireApprove": False, "resolveOn": "all", + "createOn": "any", } response = client.put( diff --git a/tests/test_rules_engine.py b/tests/test_rules_engine.py index 07ad6cf70..b80337f3d 100644 --- a/tests/test_rules_engine.py +++ b/tests/test_rules_engine.py @@ -3,10 +3,11 @@ import json import os import uuid +from time import sleep import pytest -from keep.api.core.db import create_rule as create_rule_db +from keep.api.core.db import create_rule as create_rule_db, enrich_incidents_with_alerts from keep.api.core.db import get_incident_alerts_by_incident_id, get_last_incidents, set_last_alert from keep.api.core.db import get_rules as get_rules_db from keep.api.core.dependencies import SINGLE_TENANT_UUID @@ -17,8 +18,8 @@ IncidentSeverity, IncidentStatus, ) -from keep.api.models.db.alert import Alert -from keep.api.models.db.rule import ResolveOn +from keep.api.models.db.alert import Alert, Incident +from keep.api.models.db.rule import ResolveOn, CreateIncidentOn from keep.rulesengine.rulesengine import RulesEngine @@ -72,7 +73,7 @@ def test_sanity(db_session): set_last_alert(SINGLE_TENANT_UUID, alert, db_session) # run the rules engine alerts[0].event_id = alert.id - results = rules_engine.run_rules(alerts) + results = rules_engine.run_rules(alerts, session=db_session) # check that there are results assert len(results) > 0 @@ -120,7 +121,7 @@ def test_sanity_2(db_session): set_last_alert(SINGLE_TENANT_UUID, alert, db_session) # run the rules engine alerts[0].event_id = alert.id - results = rules_engine.run_rules(alerts) + results = rules_engine.run_rules(alerts, session=db_session) # check that there are results assert len(results) > 0 @@ -169,7 +170,7 @@ def test_sanity_3(db_session): set_last_alert(SINGLE_TENANT_UUID, alert, db_session) # run the rules engine alerts[0].event_id = alert.id - results = rules_engine.run_rules(alerts) + results = 
rules_engine.run_rules(alerts, session=db_session) # check that there are results assert len(results) > 0 @@ -218,7 +219,7 @@ def test_sanity_4(db_session): set_last_alert(SINGLE_TENANT_UUID, alert, db_session) # run the rules engine alerts[0].event_id = alert.id - results = rules_engine.run_rules(alerts) + results = rules_engine.run_rules(alerts, session=db_session) # check that there are results assert results == [] @@ -251,6 +252,7 @@ def test_incident_attributes(db_session): timeunit="seconds", definition_cel='(source == "grafana" && labels.label_1 == "a")', created_by="test@keephq.dev", + create_on="any" ) rules = get_rules_db(SINGLE_TENANT_UUID) assert len(rules) == 1 @@ -273,7 +275,7 @@ def test_incident_attributes(db_session): for i, alert in enumerate(alerts_dto): alert.event_id = alerts[i].id - results = rules_engine.run_rules([alert]) + results = rules_engine.run_rules([alert], session=db_session) # check that there are results assert results is not None assert len(results) == 1 @@ -336,7 +338,7 @@ def test_incident_severity(db_session): for i, alert in enumerate(alerts_dto): alert.event_id = alerts[i].id - results = rules_engine.run_rules(alerts_dto) + results = rules_engine.run_rules(alerts_dto, session=db_session) # check that there are results assert results is not None assert len(results) == 1 @@ -581,6 +583,149 @@ def test_incident_resolution_on_edge( assert incident.status == IncidentStatus.RESOLVED.value +def test_rule_multiple_alerts(db_session, create_alert): + + create_rule_db( + tenant_id=SINGLE_TENANT_UUID, + name="test-rule", + definition={ + "sql": "N/A", # we don't use it anymore + "params": {}, + }, + timeframe=600, + timeunit="seconds", + definition_cel='(severity == "critical") || (severity == "high")', + created_by="test@keephq.dev", + create_on=CreateIncidentOn.ALL.value, + ) + + create_alert( + "Critical Alert", + AlertStatus.FIRING, + datetime.datetime.utcnow(), + { + "severity": AlertSeverity.CRITICAL.value, + }, + ) + + # No incident yet + assert db_session.query(Incident).filter(Incident.is_confirmed == True).count() == 0 + # But candidate is there + assert db_session.query(Incident).filter(Incident.is_confirmed == False).count() == 1 + incident = db_session.query(Incident).first() + alert_1 = db_session.query(Alert).order_by(Alert.timestamp.desc()).first() + + enrich_incidents_with_alerts(SINGLE_TENANT_UUID, [incident], db_session) + + assert incident.alerts_count == 1 + assert len(incident.alerts) == 1 + assert incident.alerts[0].id == alert_1.id + + create_alert( + "Critical Alert 2", + AlertStatus.FIRING, + datetime.datetime.utcnow(), + { + "severity": AlertSeverity.CRITICAL.value, + }, + ) + + db_session.refresh(incident) + alert_2 = db_session.query(Alert).order_by(Alert.timestamp.desc()).first() + + # Still no incident yet + assert db_session.query(Incident).filter(Incident.is_confirmed == True).count() == 0 + # And still one candidate is there + assert db_session.query(Incident).filter(Incident.is_confirmed == False).count() == 1 + + enrich_incidents_with_alerts(SINGLE_TENANT_UUID, [incident], db_session) + + assert incident.alerts_count == 2 + assert len(incident.alerts) == 2 + assert incident.alerts[0].id == alert_1.id + assert incident.alerts[1].id == alert_2.id + + create_alert( + "High Alert", + AlertStatus.FIRING, + datetime.datetime.utcnow(), + { + "severity": AlertSeverity.HIGH.value, + }, + ) + + alert_3 = db_session.query(Alert).order_by(Alert.timestamp.desc()).first() + enrich_incidents_with_alerts(SINGLE_TENANT_UUID, [incident], 
db_session) + + # And incident was official started + assert db_session.query(Incident).filter(Incident.is_confirmed == True).count() == 1 + + db_session.refresh(incident) + assert incident.alerts_count == 3 + + alerts, alert_count = get_incident_alerts_by_incident_id( + tenant_id=SINGLE_TENANT_UUID, + incident_id=str(incident.id), + session=db_session, + ) + assert alert_count == 3 + assert len(alerts) == 3 + + fingerprints = [a.fingerprint for a in alerts] + + assert alert_1.fingerprint in fingerprints + assert alert_2.fingerprint in fingerprints + assert alert_3.fingerprint in fingerprints + + +def test_rule_event_groups_expires(db_session, create_alert): + + create_rule_db( + tenant_id=SINGLE_TENANT_UUID, + name="test-rule", + definition={ + "sql": "N/A", # we don't use it anymore + "params": {}, + }, + timeframe=1, + timeunit="seconds", + definition_cel='(severity == "critical") || (severity == "high")', + created_by="test@keephq.dev", + create_on=CreateIncidentOn.ALL.value, + ) + + create_alert( + "Critical Alert", + AlertStatus.FIRING, + datetime.datetime.utcnow(), + { + "severity": AlertSeverity.CRITICAL.value, + }, + ) + + # Still no incident yet + assert db_session.query(Incident).filter(Incident.is_confirmed == True).count() == 0 + # And still one candidate is there + assert db_session.query(Incident).filter(Incident.is_confirmed == False).count() == 1 + + sleep(1) + + create_alert( + "High Alert", + AlertStatus.FIRING, + datetime.datetime.utcnow(), + { + "severity": AlertSeverity.HIGH.value, + }, + ) + + # Still no incident yet + assert db_session.query(Incident).filter(Incident.is_confirmed == True).count() == 0 + # And now two candidates is there + assert db_session.query(Incident).filter(Incident.is_confirmed == False).count() == 2 + + + # Next steps: # - test that alerts in the same group are being updated correctly # - test group are being updated correctly From 55cb0e5aabf8f74493c779ae838671c1fa9b399b Mon Sep 17 00:00:00 2001 From: Kirill Chernakov Date: Wed, 25 Dec 2024 16:51:39 +0400 Subject: [PATCH 02/21] chore: workflow builder: remove leftovers, unused imports, variables, css (#2900) --- .../workflows/[workflow_id]/executions.tsx | 1 - .../builder/BuilderChanagesTracker.tsx | 55 ------------------ .../(keep)/workflows/builder/CustomEdge.tsx | 12 ++-- .../workflows/builder/ReactFlowBuilder.tsx | 8 +-- .../workflows/builder/[workflowId]/page.tsx | 6 ++ .../(keep)/workflows/builder/builder-card.tsx | 57 +++++++++--------- .../workflows/builder/builder-modal.tsx | 6 +- .../workflows/builder/builder-store.tsx | 26 --------- .../workflows/builder/builder-validators.tsx | 2 - .../builder-workflow-testrun-modal.tsx | 2 +- .../app/(keep)/workflows/builder/builder.tsx | 58 ++++++++++++------- .../{alert.tsx => legacy-workflow.types.ts} | 2 - .../app/(keep)/workflows/builder/loader.tsx | 6 +- .../(keep)/workflows/builder/page.client.tsx | 27 +++++---- keep-ui/app/(keep)/workflows/builder/page.css | 27 --------- keep-ui/app/(keep)/workflows/builder/page.tsx | 7 ++- .../app/(keep)/workflows/builder/utils.tsx | 3 +- .../app/(keep)/workflows/workflow-menu.tsx | 7 +-- 18 files changed, 108 insertions(+), 204 deletions(-) delete mode 100644 keep-ui/app/(keep)/workflows/builder/BuilderChanagesTracker.tsx rename keep-ui/app/(keep)/workflows/builder/{alert.tsx => legacy-workflow.types.ts} (91%) delete mode 100644 keep-ui/app/(keep)/workflows/builder/page.css diff --git a/keep-ui/app/(keep)/workflows/[workflow_id]/executions.tsx b/keep-ui/app/(keep)/workflows/[workflow_id]/executions.tsx 
index a5ac90629..eb4ae1851 100644 --- a/keep-ui/app/(keep)/workflows/[workflow_id]/executions.tsx +++ b/keep-ui/app/(keep)/workflows/[workflow_id]/executions.tsx @@ -18,7 +18,6 @@ export default function WorkflowDetailPage({ }: { params: { workflow_id: string }; }) { - const router = useRouter(); const api = useApi(); const [navlink, setNavLink] = useState("overview"); diff --git a/keep-ui/app/(keep)/workflows/builder/BuilderChanagesTracker.tsx b/keep-ui/app/(keep)/workflows/builder/BuilderChanagesTracker.tsx deleted file mode 100644 index 8ecc1feb5..000000000 --- a/keep-ui/app/(keep)/workflows/builder/BuilderChanagesTracker.tsx +++ /dev/null @@ -1,55 +0,0 @@ -import React from "react"; -import useStore from "./builder-store"; -import { Button } from "@tremor/react"; -import { reConstructWorklowToDefinition } from "utils/reactFlow"; - -export default function BuilderChanagesTracker({ - onDefinitionChange, -}: { - onDefinitionChange: (def: Record) => void; -}) { - const { - nodes, - edges, - setEdges, - setNodes, - isLayouted, - setIsLayouted, - v2Properties, - changes, - setChanges, - lastSavedChanges, - setLastSavedChanges, - } = useStore(); - const handleDiscardChanges = (e: React.MouseEvent) => { - if (!isLayouted) return; - setEdges(lastSavedChanges.edges || []); - setNodes(lastSavedChanges.nodes || []); - setChanges(0); - setIsLayouted(false); - }; - - const handleSaveChanges = (e: React.MouseEvent) => { - e.preventDefault(); - e.stopPropagation(); - setLastSavedChanges({ nodes: nodes, edges: edges }); - const value = reConstructWorklowToDefinition({ - nodes: nodes, - edges: edges, - properties: v2Properties, - }); - onDefinitionChange(value); - setChanges(0); - }; - - return ( -
- - -
- ); -} diff --git a/keep-ui/app/(keep)/workflows/builder/CustomEdge.tsx b/keep-ui/app/(keep)/workflows/builder/CustomEdge.tsx index f1d17c72e..502f3b90b 100644 --- a/keep-ui/app/(keep)/workflows/builder/CustomEdge.tsx +++ b/keep-ui/app/(keep)/workflows/builder/CustomEdge.tsx @@ -1,6 +1,6 @@ import React from "react"; import { BaseEdge, EdgeLabelRenderer, getSmoothStepPath } from "@xyflow/react"; -import type { Edge, EdgeProps } from "@xyflow/react"; +import type { EdgeProps } from "@xyflow/react"; import useStore from "./builder-store"; import { PlusIcon } from "@radix-ui/react-icons"; import { Button } from "@tremor/react"; @@ -24,7 +24,7 @@ const CustomEdge: React.FC = ({ data, style, }: CustomEdgeProps) => { - const { deleteEdges, edges, setSelectedEdge, selectedEdge } = useStore(); + const { setSelectedEdge, selectedEdge } = useStore(); // Calculate the path and midpoint const [edgePath, labelX, labelY] = getSmoothStepPath({ @@ -35,9 +35,6 @@ const CustomEdge: React.FC = ({ borderRadius: 10, }); - const midpointX = (sourceX + targetX) / 2; - const midpointY = (sourceY + targetY) / 2; - let dynamicLabel = label; const isLayouted = !!data?.isLayouted; let showAddButton = @@ -54,8 +51,9 @@ const CustomEdge: React.FC = ({ dynamicLabel === "True" ? "left-0 bg-green-500" : dynamicLabel === "False" - ? "bg-red-500" - : "bg-orange-500"; + ? "bg-red-500" + : "bg-orange-500"; + return ( <> -
+
+
{!isLoading && ( ); } + +export const metadata: Metadata = { + title: "Keep - Workflow Builder", + description: "Build workflows with a UI builder.", +}; diff --git a/keep-ui/app/(keep)/workflows/builder/builder-card.tsx b/keep-ui/app/(keep)/workflows/builder/builder-card.tsx index 851f9fd00..a26c93c25 100644 --- a/keep-ui/app/(keep)/workflows/builder/builder-card.tsx +++ b/keep-ui/app/(keep)/workflows/builder/builder-card.tsx @@ -5,7 +5,6 @@ import { useEffect, useState } from "react"; import Loader from "./loader"; import { Provider } from "../../providers/providers"; import { useProviders } from "utils/hooks/useProviders"; -import clsx from "clsx"; const Builder = dynamic(() => import("./builder"), { ssr: false, // Prevents server-side rendering @@ -53,20 +52,14 @@ export function BuilderCard({ if (!providers || isLoading) return ( - + ); - return ( - - {error ? ( + if (error) { + return ( + Failed to load providers - ) : fileContents == "" && !workflow ? ( + + ); + } + + if (fileContents == "" && !workflow) { + return ( + - ) : ( - - )} - + + ); + } + + return ( + ); } diff --git a/keep-ui/app/(keep)/workflows/builder/builder-modal.tsx b/keep-ui/app/(keep)/workflows/builder/builder-modal.tsx index 1bbfd35ae..44eb01855 100644 --- a/keep-ui/app/(keep)/workflows/builder/builder-modal.tsx +++ b/keep-ui/app/(keep)/workflows/builder/builder-modal.tsx @@ -2,7 +2,7 @@ import { XMarkIcon } from "@heroicons/react/24/outline"; import { Button, Card, Subtitle, Title } from "@tremor/react"; import { CopyBlock, a11yLight } from "react-code-blocks"; import { stringify } from "yaml"; -import { Alert } from "./alert"; +import { Alert } from "./legacy-workflow.types"; import { useState } from "react"; import ReactLoading from "react-loading"; import { ArrowDownTrayIcon } from "@heroicons/react/20/solid"; @@ -56,8 +56,8 @@ export default function BuilderModalContent({ <>
- Generated Alert - Keep alert specification ready to use + Generated Workflow YAML + Keep workflow specification ready to use
{!hideCloseButton && ( diff --git a/keep-ui/app/(keep)/workflows/builder/builder-store.tsx b/keep-ui/app/(keep)/workflows/builder/builder-store.tsx index 551242cd7..d16d3b2b5 100644 --- a/keep-ui/app/(keep)/workflows/builder/builder-store.tsx +++ b/keep-ui/app/(keep)/workflows/builder/builder-store.tsx @@ -66,32 +66,6 @@ export type FlowNode = Node & { isNested: boolean; }; -const initialNodes: Partial[] = [ - { - id: "a", - position: { x: 0, y: 0 }, - data: { label: "Node A", type: "custom" }, - type: "custom", - }, - { - id: "b", - position: { x: 0, y: 100 }, - data: { label: "Node B", type: "custom" }, - type: "custom", - }, - { - id: "c", - position: { x: 0, y: 200 }, - data: { label: "Node C", type: "custom" }, - type: "custom", - }, -]; - -const initialEdges: Edge[] = [ - { id: "a->b", type: "custom-edge", source: "a", target: "b" }, - { id: "b->c", type: "custom-edge", source: "b", target: "c" }, -]; - export type FlowState = { nodes: FlowNode[]; edges: Edge[]; diff --git a/keep-ui/app/(keep)/workflows/builder/builder-validators.tsx b/keep-ui/app/(keep)/workflows/builder/builder-validators.tsx index 600ae004a..fc04f87ea 100644 --- a/keep-ui/app/(keep)/workflows/builder/builder-validators.tsx +++ b/keep-ui/app/(keep)/workflows/builder/builder-validators.tsx @@ -1,5 +1,3 @@ -import { Dispatch, SetStateAction } from "react"; - import { Definition as FlowDefinition, ReactFlowDefinition, diff --git a/keep-ui/app/(keep)/workflows/builder/builder-workflow-testrun-modal.tsx b/keep-ui/app/(keep)/workflows/builder/builder-workflow-testrun-modal.tsx index 94e0ae849..ca7c5d57e 100644 --- a/keep-ui/app/(keep)/workflows/builder/builder-workflow-testrun-modal.tsx +++ b/keep-ui/app/(keep)/workflows/builder/builder-workflow-testrun-modal.tsx @@ -1,5 +1,5 @@ import { XMarkIcon } from "@heroicons/react/24/outline"; -import { Button, Card, Subtitle, Title } from "@tremor/react"; +import { Button, Card, Title } from "@tremor/react"; import ReactLoading from "react-loading"; import { ExecutionResults } from "./workflow-execution-results"; import { WorkflowExecution, WorkflowExecutionFailure } from "./types"; diff --git a/keep-ui/app/(keep)/workflows/builder/builder.tsx b/keep-ui/app/(keep)/workflows/builder/builder.tsx index 0a9e2918d..8f6553f06 100644 --- a/keep-ui/app/(keep)/workflows/builder/builder.tsx +++ b/keep-ui/app/(keep)/workflows/builder/builder.tsx @@ -14,7 +14,7 @@ import { } from "@heroicons/react/20/solid"; import { globalValidatorV2, stepValidatorV2 } from "./builder-validators"; import Modal from "react-modal"; -import { Alert } from "./alert"; +import { Alert } from "./legacy-workflow.types"; import BuilderModalContent from "./builder-modal"; import Loader from "./loader"; import { stringify } from "yaml"; @@ -35,8 +35,8 @@ import { toast } from "react-toastify"; import { useApi } from "@/shared/lib/hooks/useApi"; import { KeepApiError } from "@/shared/api"; import { showErrorToast } from "@/shared/ui"; -import "./page.css"; import { YAMLException } from "js-yaml"; +import WorkflowDefinitionYAML from "../workflow-definition-yaml"; interface Props { loadedAlertFile: string | null; @@ -58,6 +58,15 @@ const INITIAL_DEFINITION = wrapDefinitionV2({ isValid: false, }); +const YAMLSidebar = ({ yaml }: { yaml?: string }) => { + return ( +
+

YAML

+ {yaml && } +
+ ); +}; + function Builder({ loadedAlertFile, fileName, @@ -349,26 +358,31 @@ function Builder({ {generateModalIsOpen || testRunModalOpen ? null : ( <> {getworkflowStatus()} -
- - { - setDefinition({ - value: { - sequence: def?.sequence || [], - properties: def?.properties || {}, - }, - isValid: def?.isValid || false, - }); - }} - toolboxConfiguration={getToolboxConfiguration(providers)} - /> - -
+ +
+
+ + { + setDefinition({ + value: { + sequence: def?.sequence || [], + properties: def?.properties || {}, + }, + isValid: def?.isValid || false, + }); + }} + toolboxConfiguration={getToolboxConfiguration(providers)} + /> + +
+ {/* TODO: Add AI chat sidebar */} +
+
)}
diff --git a/keep-ui/app/(keep)/workflows/builder/alert.tsx b/keep-ui/app/(keep)/workflows/builder/legacy-workflow.types.ts similarity index 91% rename from keep-ui/app/(keep)/workflows/builder/alert.tsx rename to keep-ui/app/(keep)/workflows/builder/legacy-workflow.types.ts index 47447bce9..4fa28c200 100644 --- a/keep-ui/app/(keep)/workflows/builder/alert.tsx +++ b/keep-ui/app/(keep)/workflows/builder/legacy-workflow.types.ts @@ -1,5 +1,3 @@ -import { V2Step } from "@/app/(keep)/workflows/builder/types"; - interface Provider { type: string; config: string; diff --git a/keep-ui/app/(keep)/workflows/builder/loader.tsx b/keep-ui/app/(keep)/workflows/builder/loader.tsx index 3775c32bb..48a485c4e 100644 --- a/keep-ui/app/(keep)/workflows/builder/loader.tsx +++ b/keep-ui/app/(keep)/workflows/builder/loader.tsx @@ -3,8 +3,10 @@ import { Title, Text } from "@tremor/react"; export default function Loader() { return (
-      Please start by loading or creating a new alert
-      You can use the `Load` or `+` button from the top right menu
+      Please start by loading or creating a new workflow
+
+        Load YAML or use the "New" button from the top right menu
+
); } diff --git a/keep-ui/app/(keep)/workflows/builder/page.client.tsx b/keep-ui/app/(keep)/workflows/builder/page.client.tsx index 3982dcaaa..24fcde10b 100644 --- a/keep-ui/app/(keep)/workflows/builder/page.client.tsx +++ b/keep-ui/app/(keep)/workflows/builder/page.client.tsx @@ -1,10 +1,9 @@ "use client"; -import { Title, Button, Subtitle, Badge } from "@tremor/react"; +import { Title, Button } from "@tremor/react"; import { useEffect, useRef, useState } from "react"; import { PlusIcon, - ArrowDownOnSquareIcon, BoltIcon, ArrowUpOnSquareIcon, PlayIcon, @@ -37,13 +36,17 @@ export default function PageClient({ setFileName(""); }, []); - function loadAlert() { - document.getElementById("alertFile")?.click(); + function loadWorkflow() { + document.getElementById("workflowFile")?.click(); } - function newAlert() { - const confirmed = confirm("Are you sure you want to create a new alert?"); - if (confirmed) window.location.reload(); + function createNewWorkflow() { + const confirmed = confirm( + "Are you sure you want to create a new workflow?" + ); + if (confirmed) { + window.location.reload(); + } } const enableButtons = () => setButtonsEnabled(true); @@ -87,7 +90,7 @@ export default function PageClient({ setTriggerGenerate(incrementState)} > - Generate + Get YAML )}
diff --git a/keep-ui/app/(keep)/workflows/builder/page.css b/keep-ui/app/(keep)/workflows/builder/page.css deleted file mode 100644 index 8d1c3cd58..000000000 --- a/keep-ui/app/(keep)/workflows/builder/page.css +++ /dev/null @@ -1,27 +0,0 @@ -.sqd-designer-react { - height: inherit; - border-radius: 10px; -} - -.sqd-designer { - height: 100%; -} - -.sqd-root-start-stop-circle, -.sqd-label-rect { - fill: rgb(255, 132, 16) !important; -} - -.sqd-smart-editor { - overflow: scroll; -} - -.sqd-toolbox-item-text { - text-transform: capitalize; -} - -.tooltip { - position: relative; - display: inline-block; - border-bottom: 1px dotted black; /* If you want dots under the hoverable text */ -} diff --git a/keep-ui/app/(keep)/workflows/builder/page.tsx b/keep-ui/app/(keep)/workflows/builder/page.tsx index 25ca84cde..cd81fc373 100644 --- a/keep-ui/app/(keep)/workflows/builder/page.tsx +++ b/keep-ui/app/(keep)/workflows/builder/page.tsx @@ -1,6 +1,7 @@ import PageClient from "./page.client"; import { Suspense } from "react"; import Loading from "@/app/(keep)/loading"; +import { Metadata } from "next"; type PageProps = { params: { workflow: string; workflowId: string }; @@ -15,7 +16,7 @@ export default function Page({ params, searchParams }: PageProps) { ); } -export const metadata = { - title: "Keep - Builder", - description: "Build alerts with a visual workflow designer.", +export const metadata: Metadata = { + title: "Keep - Workflow Builder", + description: "Build workflows with a UI builder.", }; diff --git a/keep-ui/app/(keep)/workflows/builder/utils.tsx b/keep-ui/app/(keep)/workflows/builder/utils.tsx index 728765031..2b7eb7c21 100644 --- a/keep-ui/app/(keep)/workflows/builder/utils.tsx +++ b/keep-ui/app/(keep)/workflows/builder/utils.tsx @@ -1,7 +1,6 @@ import { load, JSON_SCHEMA } from "js-yaml"; import { Provider } from "../../providers/providers"; -import { Action, Alert } from "./alert"; -import { stringify } from "yaml"; +import { Action, Alert } from "./legacy-workflow.types"; import { v4 as uuidv4 } from "uuid"; import { Definition, diff --git a/keep-ui/app/(keep)/workflows/workflow-menu.tsx b/keep-ui/app/(keep)/workflows/workflow-menu.tsx index ee89eb84d..f3438bdb4 100644 --- a/keep-ui/app/(keep)/workflows/workflow-menu.tsx +++ b/keep-ui/app/(keep)/workflows/workflow-menu.tsx @@ -4,16 +4,11 @@ import { EllipsisHorizontalIcon } from "@heroicons/react/20/solid"; import { Icon } from "@tremor/react"; import { EyeIcon, - PencilIcon, PlayIcon, TrashIcon, WrenchIcon, } from "@heroicons/react/24/outline"; -import { - DownloadIcon, - LockClosedIcon, - LockOpen1Icon, -} from "@radix-ui/react-icons"; +import { DownloadIcon } from "@radix-ui/react-icons"; import React from "react"; interface WorkflowMenuProps { From c8506dfc3359502bbe90dbd22ef6f433a61557d9 Mon Sep 17 00:00:00 2001 From: Kirill Chernakov Date: Wed, 25 Dec 2024 17:56:56 +0400 Subject: [PATCH 03/21] fix: frontend warnings (#2902) --- .../ui/incident-table-component.tsx | 45 +++++++++---------- .../incident-list/ui/incidents-table.tsx | 11 +++-- keep-ui/next.config.js | 13 ++++++ 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/keep-ui/features/incident-list/ui/incident-table-component.tsx b/keep-ui/features/incident-list/ui/incident-table-component.tsx index 43c46764c..74d3e9623 100644 --- a/keep-ui/features/incident-list/ui/incident-table-component.tsx +++ b/keep-ui/features/incident-list/ui/incident-table-component.tsx @@ -100,29 +100,28 @@ export const IncidentTableComponent = (props: Props) => { 
{table.getRowModel().rows.map((row) => ( - <> - - {row.getVisibleCells().map((cell) => { - const { style, className } = - getCommonPinningStylesAndClassNames(cell.column); - return ( - - {flexRender(cell.column.columnDef.cell, cell.getContext())} - - ); - })} - - + + {row.getVisibleCells().map((cell) => { + const { style, className } = getCommonPinningStylesAndClassNames( + cell.column + ); + return ( + + {flexRender(cell.column.columnDef.cell, cell.getContext())} + + ); + })} + ))} diff --git a/keep-ui/features/incident-list/ui/incidents-table.tsx b/keep-ui/features/incident-list/ui/incidents-table.tsx index 8ddf23ddc..c46c619f8 100644 --- a/keep-ui/features/incident-list/ui/incidents-table.tsx +++ b/keep-ui/features/incident-list/ui/incidents-table.tsx @@ -285,7 +285,6 @@ export default function IncidentsTable({ enableSorting: true, enableMultiSort: false, manualSorting: true, - debugTable: true, }); const selectedRowIds = Object.entries( @@ -315,11 +314,11 @@ export default function IncidentsTable({ return; } - if ( - !confirm( - `Are you sure you want to delete ${selectedRowIds.length} incidents? This action cannot be undone.` - ) - ) { + const isConfirmed = confirm( + `Are you sure you want to delete ${selectedRowIds.length} incidents? This action cannot be undone.` + ); + + if (!isConfirmed) { return; } diff --git a/keep-ui/next.config.js b/keep-ui/next.config.js index 82cfd0720..4410d4784 100644 --- a/keep-ui/next.config.js +++ b/keep-ui/next.config.js @@ -24,6 +24,19 @@ const nextConfig = { ); } + // Ignore warnings about critical dependencies, since they are not critical + // https://github.com/getsentry/sentry-javascript/issues/12077#issuecomment-2407569917 + config.ignoreWarnings = [ + ...(config.ignoreWarnings || []), + { + module: /@opentelemetry\/instrumentation/, + message: /Critical dependency/, + }, + { + module: /@prisma\/instrumentation/, + message: /Critical dependency/, + }, + ]; return config; }, transpilePackages: ["next-auth"], From bc21d894f431ca745f6bda59efa703855de19b48 Mon Sep 17 00:00:00 2001 From: Tal Date: Wed, 25 Dec 2024 17:22:19 +0100 Subject: [PATCH 04/21] feat(api): simple rate limiting with slowapi (#2903) --- docs/deployment/configuration.mdx | 18 +++++++++ keep/api/api.py | 30 ++++++++------ keep/api/core/dependencies.py | 1 + keep/api/core/limiter.py | 8 ++++ keep/api/core/metrics.py | 14 +------ keep/api/routes/alerts.py | 3 ++ keep/api/routes/metrics.py | 37 +++++++++++++++-- poetry.lock | 67 ++++++++++++++++++++++++++++--- pyproject.toml | 4 +- 9 files changed, 145 insertions(+), 37 deletions(-) create mode 100644 keep/api/core/limiter.py diff --git a/docs/deployment/configuration.mdx b/docs/deployment/configuration.mdx index 61a7a501d..30b0584a7 100644 --- a/docs/deployment/configuration.mdx +++ b/docs/deployment/configuration.mdx @@ -220,6 +220,24 @@ ARQ (Asynchronous Task Queue) configuration controls Keep's background task proc | **ARQ_EXPIRES** | Default job expiration time (in seconds) | No | 3600 | Positive integer | | **ARQ_EXPIRES_AI** | AI job expiration time (in seconds) | No | 3600000 | Positive integer | +### Rate Limiting + +Rate limiting configuration controls how many requests can be made to Keep's API endpoints within a specified time period. This helps prevent abuse and ensures system stability. 
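+
+For example, a deployment that wants to cap event ingestion could set `KEEP_USE_LIMITER=true` and `KEEP_LIMIT_CONCURRENCY="60/minute"` (illustrative values, not the defaults; limits are applied per client IP address).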
+ + +| Env var | Purpose | Required | Default Value | Valid options | +|:-------------------:|:-------:|:----------:|:-------------:|:-------------:| +| **KEEP_USE_LIMITER** | Enables or disables rate limiting | No | "false" | "true" or "false" | +| **KEEP_LIMIT_CONCURRENCY** | Sets the rate limit for API endpoints | No | "100/minute" | Format: "{number}/{interval}" where interval can be "second", "minute", "hour", "day" | + + +Currently, rate limiting is applied to the following endpoints: +- POST `/alerts/event` - Generic event ingestion endpoint +- POST `/alerts/{provider_type}` - Provider-specific event ingestion endpoints + +These endpoints are rate-limited according to the `KEEP_LIMIT_CONCURRENCY` setting when `KEEP_USE_LIMITER` is enabled. + + ## Frontend Environment Variables Frontend configuration variables control the behavior and features of Keep's user interface. These settings are crucial for customizing the frontend's appearance, functionality, and integration with the backend services. diff --git a/keep/api/api.py b/keep/api/api.py index eadddd138..7be28833a 100644 --- a/keep/api/api.py +++ b/keep/api/api.py @@ -10,6 +10,9 @@ from fastapi import FastAPI, Request from fastapi.middleware.gzip import GZipMiddleware from fastapi.responses import JSONResponse +from prometheus_fastapi_instrumentator import Instrumentator +from slowapi import _rate_limit_exceeded_handler +from slowapi.errors import RateLimitExceeded from starlette.middleware.cors import CORSMiddleware from starlette_context import plugins from starlette_context.middleware import RawContextMiddleware @@ -28,6 +31,7 @@ from keep.api.core.config import config from keep.api.core.db import dispose_session from keep.api.core.dependencies import SINGLE_TENANT_UUID +from keep.api.core.limiter import limiter from keep.api.logging import CONFIG as logging_config from keep.api.middlewares import LoggingMiddleware from keep.api.routes import ( @@ -190,6 +194,7 @@ async def lifespan(app: FastAPI): This runs for every worker on startup and shutdown. 
Read more about lifespan here: https://fastapi.tiangolo.com/advanced/events/#lifespan """ + app.state.limiter = limiter # create a set of background tasks background_tasks = set() # if debug tasks are enabled, create a task to check for pending tasks @@ -241,6 +246,7 @@ async def root(): return {"message": app.description, "version": KEEP_VERSION} app.add_middleware(RawContextMiddleware, plugins=(plugins.RequestIdPlugin(),)) + app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) app.add_middleware( GZipMiddleware, minimum_size=30 * 1024 * 1024 ) # Approximately 30 MiB, https://cloud.google.com/run/quotas @@ -313,6 +319,8 @@ async def catch_exception(request: Request, exc: Exception): app.add_middleware(LoggingMiddleware) + if config("KEEP_METRICS", default="true", cast=bool): + Instrumentator().instrument(app=app, metric_namespace="keep") keep.api.observability.setup(app) return app @@ -325,16 +333,12 @@ def run(app: FastAPI): keep.api.config.on_starting() - # run the server - workers = config("KEEP_WORKERS", default=None, cast=int) - if workers: - uvicorn.run( - "keep.api.api:get_app", - host=HOST, - port=PORT, - log_config=logging_config, - lifespan="on", - workers=workers, - ) - else: - uvicorn.run(app, host=HOST, port=PORT, log_config=logging_config, lifespan="on") + uvicorn.run( + "keep.api.api:get_app", + host=HOST, + port=PORT, + log_config=logging_config, + lifespan="on", + workers=config("KEEP_WORKERS", default=None, cast=int), + limit_concurrency=config("KEEP_LIMIT_CONCURRENCY", default=None, cast=int), + ) diff --git a/keep/api/core/dependencies.py b/keep/api/core/dependencies.py index 9d61505c8..5e57cf171 100644 --- a/keep/api/core/dependencies.py +++ b/keep/api/core/dependencies.py @@ -12,6 +12,7 @@ SINGLE_TENANT_UUID = "keep" SINGLE_TENANT_EMAIL = "admin@keephq" + async def extract_generic_body(request: Request) -> dict | bytes | FormData: """ Extracts the body of the request based on the content type. 
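Taken together, the hunks above and the new `limiter.py` module below follow slowapi's documented FastAPI recipe: a module-level `Limiter` keyed by the remote address, registered on `app.state` alongside the `RateLimitExceeded` handler, and applied per route with a decorator. A condensed, runnable sketch of that wiring — the route path and limit string are taken from this commit, everything else is illustrative:

```python
from fastapi import FastAPI, Request
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address

# One shared limiter, keyed by the caller's IP (cf. keep/api/core/limiter.py below)
limiter = Limiter(key_func=get_remote_address, enabled=True)

app = FastAPI()
app.state.limiter = limiter  # slowapi looks the limiter up on app.state
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)


@app.post("/alerts/event")
@limiter.limit("100/minute")  # Keep reads this value from KEEP_LIMIT_CONCURRENCY
async def receive_generic_event(request: Request):
    # slowapi requires the Request parameter to resolve the client address
    return {"status": "accepted"}
```
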
diff --git a/keep/api/core/limiter.py b/keep/api/core/limiter.py new file mode 100644 index 000000000..ddce7888b --- /dev/null +++ b/keep/api/core/limiter.py @@ -0,0 +1,8 @@ +# https://slowapi.readthedocs.io/en/latest/#fastapi +from slowapi import Limiter +from slowapi.util import get_remote_address + +from keep.api.core.config import config + +limiter_enabled = config("KEEP_USE_LIMITER", default="false", cast=bool) +limiter = Limiter(key_func=get_remote_address, enabled=limiter_enabled) diff --git a/keep/api/core/metrics.py b/keep/api/core/metrics.py index 39c5d6177..8ff667855 100644 --- a/keep/api/core/metrics.py +++ b/keep/api/core/metrics.py @@ -1,51 +1,39 @@ import os -from prometheus_client import CollectorRegistry, Counter, Gauge, Summary, multiprocess +from prometheus_client import Counter, Gauge, Summary PROMETHEUS_MULTIPROC_DIR = os.environ.get("PROMETHEUS_MULTIPROC_DIR", "/tmp/prometheus") os.makedirs(PROMETHEUS_MULTIPROC_DIR, exist_ok=True) METRIC_PREFIX = "keep_" -# Create a single registry for all metrics -registry = CollectorRegistry() -multiprocess.MultiProcessCollector(registry, path=PROMETHEUS_MULTIPROC_DIR) - # Process event metrics events_in_counter = Counter( f"{METRIC_PREFIX}events_in_total", "Total number of events received", - registry=registry, ) events_out_counter = Counter( f"{METRIC_PREFIX}events_processed_total", "Total number of events processed", - registry=registry, ) events_error_counter = Counter( f"{METRIC_PREFIX}events_error_total", "Total number of events with error", - registry=registry, ) processing_time_summary = Summary( f"{METRIC_PREFIX}processing_time_seconds", "Average time spent processing events", - registry=registry, ) -# Running tasks metrics running_tasks_gauge = Gauge( f"{METRIC_PREFIX}running_tasks_current", "Current number of running tasks", - registry=registry, multiprocess_mode="livesum", ) -# Per-process running tasks metrics running_tasks_by_process_gauge = Gauge( f"{METRIC_PREFIX}running_tasks_by_process", "Current number of running tasks per process", labelnames=["pid"], - registry=registry, multiprocess_mode="livesum", ) diff --git a/keep/api/routes/alerts.py b/keep/api/routes/alerts.py index 8f3870328..dbc411a58 100644 --- a/keep/api/routes/alerts.py +++ b/keep/api/routes/alerts.py @@ -28,6 +28,7 @@ ) from keep.api.core.dependencies import extract_generic_body, get_pusher_client from keep.api.core.elastic import ElasticClient +from keep.api.core.limiter import limiter from keep.api.core.metrics import running_tasks_by_process_gauge, running_tasks_gauge from keep.api.models.alert import ( AlertDto, @@ -344,6 +345,7 @@ def create_process_event_task( response_model=AlertDto | list[AlertDto], status_code=202, ) +@limiter.limit(config("KEEP_LIMIT_CONCURRENCY", default="100/minute", cast=str)) async def receive_generic_event( event: AlertDto | list[AlertDto] | dict, bg_tasks: BackgroundTasks, @@ -432,6 +434,7 @@ async def webhook_challenge(): description="Receive an alert event from a provider", status_code=202, ) +@limiter.limit(config("KEEP_LIMIT_CONCURRENCY", default="100/minute", cast=str)) async def receive_event( provider_type: str, bg_tasks: BackgroundTasks, diff --git a/keep/api/routes/metrics.py b/keep/api/routes/metrics.py index 4f7f121c3..71444c2e5 100644 --- a/keep/api/routes/metrics.py +++ b/keep/api/routes/metrics.py @@ -2,14 +2,21 @@ import chevron from fastapi import APIRouter, Depends, Query, Request, Response -from prometheus_client import CONTENT_TYPE_LATEST, generate_latest +from fastapi.responses import JSONResponse 
+from prometheus_client import ( + CONTENT_TYPE_LATEST, + CollectorRegistry, + generate_latest, + multiprocess, +) +from keep.api.core.config import config from keep.api.core.db import ( get_last_alerts_for_incidents, get_last_incidents, get_workflow_executions_count, ) -from keep.api.core.metrics import registry +from keep.api.core.limiter import limiter from keep.api.models.alert import AlertDto from keep.identitymanager.authenticatedentity import AuthenticatedEntity from keep.identitymanager.identitymanagerfactory import IdentityManagerFactory @@ -20,8 +27,14 @@ @router.get("/processing", include_in_schema=False) -async def get_processing_metrics(request: Request): - # Generate all metrics from the single registry +async def get_processing_metrics( + request: Request, + authenticated_entity: AuthenticatedEntity = Depends( + IdentityManagerFactory.get_auth_verifier(["read:metrics"]) + ), +): + registry = CollectorRegistry() + multiprocess.MultiProcessCollector(registry) metrics = generate_latest(registry) return Response(content=metrics, media_type=CONTENT_TYPE_LATEST) @@ -122,3 +135,19 @@ def get_metrics( export += f"workflows_executions_total {{status=\"other\"}} {workflow_execution_counts['other']}\n" return Response(content=export, media_type=CONTENT_TYPE_LATEST) + + +@router.get("/dumb", include_in_schema=False) +@limiter.limit(config("KEEP_LIMIT_CONCURRENCY", default="10/minute", cast=str)) +async def get_dumb(request: Request) -> JSONResponse: + """ + This endpoint is used to test the rate limiting. + + Args: + request (Request): The request object. + + Returns: + JSONResponse: A JSON response with the message "hello world" ({"hello": "world"}). + """ + # await asyncio.sleep(5) + return JSONResponse(content={"hello": "world"}) diff --git a/poetry.lock b/poetry.lock index 6ef1eb261..79d159014 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2379,6 +2379,34 @@ atomic-cache = ["atomicwrites"] nearley = ["js2py"] regex = ["regex"] +[[package]] +name = "limits" +version = "3.14.1" +description = "Rate limiting utilities" +optional = false +python-versions = ">=3.9" +files = [ + {file = "limits-3.14.1-py3-none-any.whl", hash = "sha256:051aca02da56e6932599a25cb8e70543959294f5d587d57bcd7e38df234e697b"}, + {file = "limits-3.14.1.tar.gz", hash = "sha256:cad16a9b3cf3924e27da48e78bdab33ef312ecb7194fdb50e509cc8111c8d0bb"}, +] + +[package.dependencies] +deprecated = ">=1.2" +packaging = ">=21,<25" +typing-extensions = "*" + +[package.extras] +all = ["aetcd", "coredis (>=3.4.0,<5)", "emcache (>=0.6.1)", "emcache (>=1)", "etcd3", "motor (>=3,<4)", "pymemcache (>3,<5.0.0)", "pymongo (>4.1,<5)", "redis (>3,!=4.5.2,!=4.5.3,<6.0.0)", "redis (>=4.2.0,!=4.5.2,!=4.5.3)"] +async-etcd = ["aetcd"] +async-memcached = ["emcache (>=0.6.1)", "emcache (>=1)"] +async-mongodb = ["motor (>=3,<4)"] +async-redis = ["coredis (>=3.4.0,<5)"] +etcd = ["etcd3"] +memcached = ["pymemcache (>3,<5.0.0)"] +mongodb = ["pymongo (>4.1,<5)"] +redis = ["redis (>3,!=4.5.2,!=4.5.3,<6.0.0)"] +rediscluster = ["redis (>=4.2.0,!=4.5.2,!=4.5.3)"] + [[package]] name = "logmine" version = "0.4.1" @@ -3307,6 +3335,21 @@ files = [ [package.extras] twisted = ["twisted"] +[[package]] +name = "prometheus-fastapi-instrumentator" +version = "7.0.0" +description = "Instrument your FastAPI with Prometheus metrics." 
+optional = false +python-versions = ">=3.8.1,<4.0.0" +files = [ + {file = "prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl", hash = "sha256:96030c43c776ee938a3dae58485ec24caed7e05bfc60fe067161e0d5b5757052"}, + {file = "prometheus_fastapi_instrumentator-7.0.0.tar.gz", hash = "sha256:5ba67c9212719f244ad7942d75ded80693b26331ee5dfc1e7571e4794a9ccbed"}, +] + +[package.dependencies] +prometheus-client = ">=0.8.0,<1.0.0" +starlette = ">=0.30.0,<1.0.0" + [[package]] name = "propcache" version = "0.2.1" @@ -4490,7 +4533,6 @@ files = [ {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76"}, {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:22353049ba4181685023b25b5b51a574bce33e7f51c759371a7422dcae5402a6"}, {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:932205970b9f9991b34f55136be327501903f7c66830e9760a8ffb15b07f05cd"}, - {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a52d48f4e7bf9005e8f0a89209bf9a73f7190ddf0489eee5eb51377385f59f2a"}, {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da"}, {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28"}, {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6"}, @@ -4499,7 +4541,6 @@ files = [ {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52"}, {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cf12567a7b565cbf65d438dec6cfbe2917d3c1bdddfce84a9930b7d35ea59642"}, {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7dd5adc8b930b12c8fc5b99e2d535a09889941aa0d0bd06f4749e9a9397c71d2"}, - {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1492a6051dab8d912fc2adeef0e8c72216b24d57bd896ea607cb90bb0c4981d3"}, {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4"}, {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb"}, {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632"}, @@ -4508,7 +4549,6 @@ files = [ {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd"}, {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf165fef1f223beae7333275156ab2022cffe255dcc51c27f066b4370da81e31"}, {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32621c177bbf782ca5a18ba4d7af0f1082a3f6e517ac2a18b3974d4edf349680"}, - {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b82a7c94a498853aa0b272fd5bc67f29008da798d4f93a2f9f289feb8426a58d"}, {file = 
"ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5"}, {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4"}, {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a"}, @@ -4517,7 +4557,6 @@ files = [ {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6"}, {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:3bc2a80e6420ca8b7d3590791e2dfc709c88ab9152c00eeb511c9875ce5778bf"}, {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e188d2699864c11c36cdfdada94d781fd5d6b0071cd9c427bceb08ad3d7c70e1"}, - {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4f6f3eac23941b32afccc23081e1f50612bdbe4e982012ef4f5797986828cd01"}, {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6"}, {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3"}, {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fc4b630cd3fa2cf7fce38afa91d7cfe844a9f75d7f0f36393fa98815e911d987"}, @@ -4526,7 +4565,6 @@ files = [ {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2f1c3765db32be59d18ab3953f43ab62a761327aafc1594a2a1fbe038b8b8a7"}, {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d85252669dc32f98ebcd5d36768f5d4faeaeaa2d655ac0473be490ecdae3c285"}, {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e143ada795c341b56de9418c58d028989093ee611aa27ffb9b7f609c00d813ed"}, - {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2c59aa6170b990d8d2719323e628aaf36f3bfbc1c26279c0eeeb24d05d2d11c7"}, {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win32.whl", hash = "sha256:beffaed67936fbbeffd10966a4eb53c402fafd3d6833770516bf7314bc6ffa12"}, {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win_amd64.whl", hash = "sha256:040ae85536960525ea62868b642bdb0c2cc6021c9f9d507810c0c604e66f5a7b"}, {file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"}, @@ -4686,6 +4724,23 @@ files = [ {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, ] +[[package]] +name = "slowapi" +version = "0.1.9" +description = "A rate limiting extension for Starlette and Fastapi" +optional = false +python-versions = ">=3.7,<4.0" +files = [ + {file = "slowapi-0.1.9-py3-none-any.whl", hash = "sha256:cfad116cfb84ad9d763ee155c1e5c5cbf00b0d47399a769b227865f5df576e36"}, + {file = "slowapi-0.1.9.tar.gz", hash = "sha256:639192d0f1ca01b1c6d95bf6c71d794c3a9ee189855337b4821f7f457dddad77"}, +] + +[package.dependencies] +limits = ">=2.3" + +[package.extras] +redis = ["redis (>=3.4.1,<4.0.0)"] + [[package]] name = "sniffio" version = "1.3.1" @@ -5408,4 +5463,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.11,<3.12" -content-hash = 
"8e764f85116828d2a0f772cc96ae127707ac1ca6ee061b42823f99d79975dc73" +content-hash = "d1ecb84ec2278190d29b2131ef67b077971af74f076c0b4055c475073f36ad10" diff --git a/pyproject.toml b/pyproject.toml index f2f5d6475..0fef9c044 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "keep" -version = "0.32.8" +version = "0.33.0" description = "Alerting. for developers, by developers." authors = ["Keep Alerting LTD"] packages = [{include = "keep"}] @@ -89,6 +89,8 @@ psycopg = "^3.2.3" prometheus-client = "^0.21.1" psycopg2-binary = "^2.9.10" +prometheus-fastapi-instrumentator = "^7.0.0" +slowapi = "^0.1.9" [tool.poetry.group.dev.dependencies] pre-commit = "^3.0.4" pre-commit-hooks = "^4.4.0" From 995e8e26b9229394fe9979ae12bb4f732b090efc Mon Sep 17 00:00:00 2001 From: DavidZarzevsky <108043629+DavidZarzevsky@users.noreply.github.com> Date: Thu, 26 Dec 2024 17:49:32 +0200 Subject: [PATCH 05/21] fix: Round the provider form localhost warning corners (#2913) --- keep-ui/app/(keep)/providers/provider-form.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keep-ui/app/(keep)/providers/provider-form.tsx b/keep-ui/app/(keep)/providers/provider-form.tsx index 6ab1878da..a0267c8bd 100644 --- a/keep-ui/app/(keep)/providers/provider-form.tsx +++ b/keep-ui/app/(keep)/providers/provider-form.tsx @@ -514,7 +514,7 @@ const ProviderForm = ({
{provider.can_setup_webhook && !installedProvidersMode && ( -
+
Date: Fri, 27 Dec 2024 10:31:52 +0200 Subject: [PATCH 06/21] feat(api): memory consumption and performance improvements (#2908) Signed-off-by: Shahar Glazner Signed-off-by: Tal Co-authored-by: Tal --- .gitignore | 2 + .../alert_deduplicator/alert_deduplicator.py | 12 +- keep/api/core/demo_mode.py | 7 +- keep/api/core/metrics.py | 54 +++- keep/api/logging.py | 15 +- keep/api/observability.py | 34 ++- keep/api/routes/alerts.py | 93 ++++--- keep/cli/cli.py | 119 -------- keep/workflowmanager/workflowmanager.py | 41 +-- keep/workflowmanager/workflowscheduler.py | 257 ++++++++++++------ poetry.lock | 21 +- pyproject.toml | 4 +- tests/test_workflow_execution.py | 33 ++- tests/test_workflowmanager.py | 95 ++++--- 14 files changed, 419 insertions(+), 368 deletions(-) diff --git a/.gitignore b/.gitignore index 6bcb9b6e7..38d405e05 100644 --- a/.gitignore +++ b/.gitignore @@ -214,3 +214,5 @@ oauth2.cfg scripts/keep_slack_bot.py keepnew.db providers_cache.json + +tests/provision/* diff --git a/keep/api/alert_deduplicator/alert_deduplicator.py b/keep/api/alert_deduplicator/alert_deduplicator.py index 09e30ab9f..57b4e12d0 100644 --- a/keep/api/alert_deduplicator/alert_deduplicator.py +++ b/keep/api/alert_deduplicator/alert_deduplicator.py @@ -25,7 +25,6 @@ DeduplicationRuleRequestDto, ) from keep.providers.providers_factory import ProvidersFactory -from keep.searchengine.searchengine import SearchEngine DEFAULT_RULE_UUID = "00000000-0000-0000-0000-000000000000" @@ -42,7 +41,6 @@ class AlertDeduplicator: def __init__(self, tenant_id): self.logger = logging.getLogger(__name__) self.tenant_id = tenant_id - self.search_engine = SearchEngine(self.tenant_id) def _apply_deduplication_rule( self, alert: AlertDto, rule: DeduplicationRuleDto @@ -264,7 +262,7 @@ def _get_default_full_deduplication_rule( ingested=0, dedup_ratio=0.0, enabled=True, - is_provisioned=False + is_provisioned=False, ) def get_deduplications(self) -> list[DeduplicationRuleDto]: @@ -502,15 +500,15 @@ def update_deduplication_rule( rule_dto = self.create_deduplication_rule(rule, updated_by) self.logger.info("Default rule updated") return rule_dto - + rule_before_update = get_deduplication_rule_by_id(self.tenant_id, rule_id) - + if not rule_before_update: raise HTTPException( status_code=404, detail="Deduplication rule not found", ) - + if rule_before_update.is_provisioned: raise HTTPException( status_code=409, @@ -557,7 +555,7 @@ def delete_deduplication_rule(self, rule_id: str) -> bool: status_code=404, detail="Deduplication rule not found", ) - + if deduplication_rule_to_be_deleted.is_provisioned: raise HTTPException( status_code=409, diff --git a/keep/api/core/demo_mode.py b/keep/api/core/demo_mode.py index e18ea90db..4ea72a1ab 100644 --- a/keep/api/core/demo_mode.py +++ b/keep/api/core/demo_mode.py @@ -568,7 +568,6 @@ async def simulate_alerts_async( logger.info( "Sleeping for {} seconds before next iteration".format(sleep_interval) ) - await asyncio.sleep(sleep_interval) def launch_demo_mode_thread( @@ -623,11 +622,14 @@ async def simulate_alerts_worker(worker_id, keep_api_key, rps=1): url, alert = await REQUESTS_QUEUE.get() async with session.post(url, json=alert, headers=headers) as response: + response_time = time.time() - start total_requests += 1 if not response.ok: logger.error("Failed to send alert: {}".format(response.text)) else: - logger.info("Alert sent successfully") + logger.info( + f"Alert sent successfully in {response_time:.3f} seconds" + ) if rps: delay = 1 / rps - (time.time() - start) @@ -639,6 +641,7 @@ async def 
simulate_alerts_worker(worker_id, keep_api_key, rps=1): worker_id, total_requests / (time.time() - total_start), ) + logger.info("Total requests: %d", total_requests) if __name__ == "__main__": diff --git a/keep/api/core/metrics.py b/keep/api/core/metrics.py index 8ff667855..df2ab8729 100644 --- a/keep/api/core/metrics.py +++ b/keep/api/core/metrics.py @@ -1,6 +1,6 @@ import os -from prometheus_client import Counter, Gauge, Summary +from prometheus_client import Counter, Gauge, Histogram, Summary PROMETHEUS_MULTIPROC_DIR = os.environ.get("PROMETHEUS_MULTIPROC_DIR", "/tmp/prometheus") os.makedirs(PROMETHEUS_MULTIPROC_DIR, exist_ok=True) @@ -37,3 +37,55 @@ labelnames=["pid"], multiprocess_mode="livesum", ) + +### WORKFLOWS +METRIC_PREFIX = "keep_workflows_" + +# Workflow execution metrics +workflow_executions_total = Counter( + f"{METRIC_PREFIX}executions_total", + "Total number of workflow executions", + labelnames=["tenant_id", "workflow_id", "trigger_type"], +) + +workflow_execution_errors_total = Counter( + f"{METRIC_PREFIX}execution_errors_total", + "Total number of workflow execution errors", + labelnames=["tenant_id", "workflow_id", "error_type"], +) + +workflow_execution_status = Counter( + f"{METRIC_PREFIX}execution_status_total", + "Total number of workflow executions by status", + labelnames=["tenant_id", "workflow_id", "status"], +) + +# Workflow performance metrics +workflow_execution_duration = Histogram( + f"{METRIC_PREFIX}execution_duration_seconds", + "Time spent executing workflows", + labelnames=["tenant_id", "workflow_id"], + buckets=(1, 5, 10, 30, 60, 120, 300, 600), # 1s, 5s, 10s, 30s, 1m, 2m, 5m, 10m +) + +workflow_execution_step_duration = Histogram( + f"{METRIC_PREFIX}execution_step_duration_seconds", + "Time spent executing individual workflow steps", + labelnames=["tenant_id", "workflow_id", "step_name"], + buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60), +) + +# Workflow state metrics +workflows_running = Gauge( + f"{METRIC_PREFIX}running", + "Number of currently running workflows", + labelnames=["tenant_id"], + multiprocess_mode="livesum", +) + +workflow_queue_size = Gauge( + f"{METRIC_PREFIX}queue_size", + "Number of workflows waiting to be executed", + labelnames=["tenant_id"], + multiprocess_mode="livesum", +) diff --git a/keep/api/logging.py b/keep/api/logging.py index 5d74adaff..a91659457 100644 --- a/keep/api/logging.py +++ b/keep/api/logging.py @@ -194,6 +194,7 @@ def process(self, msg, kwargs): LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") +KEEP_LOG_FILE = os.environ.get("KEEP_LOG_FILE") LOG_FORMAT_OPEN_TELEMETRY = "open_telemetry" LOG_FORMAT_DEVELOPMENT_TERMINAL = "dev_terminal" @@ -234,7 +235,7 @@ def format(self, record): }, "dev_terminal": { "()": DevTerminalFormatter, - "format": "%(asctime)s - %(levelname)s - %(message)s", + "format": "%(asctime)s - %(thread)s %(threadName)s %(levelname)s - %(message)s", }, }, "handlers": { @@ -369,6 +370,18 @@ def _log( def setup_logging(): + # Add file handler if KEEP_LOG_FILE is set + if KEEP_LOG_FILE: + CONFIG["handlers"]["file"] = { + "level": "DEBUG", + "formatter": ("json"), + "class": "logging.FileHandler", + "filename": KEEP_LOG_FILE, + "mode": "a", + } + # Add file handler to root logger + CONFIG["loggers"][""]["handlers"].append("file") + logging.config.dictConfig(CONFIG) uvicorn_error_logger = logging.getLogger("uvicorn.error") uvicorn_error_logger.__class__ = CustomizedUvicornLogger diff --git a/keep/api/observability.py b/keep/api/observability.py index b5aa3e0fa..1987a7702 100644 --- 
a/keep/api/observability.py +++ b/keep/api/observability.py @@ -23,20 +23,26 @@ from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor + def get_protocol_from_endpoint(endpoint): - parsed_url = urlparse(endpoint) - if parsed_url.scheme == "http": - return HTTPOTLPSpanExporter - elif parsed_url.scheme == "grpc": - return GRPCOTLPSpanExporter - else: - raise ValueError(f"Unsupported protocol: {parsed_url.scheme}") + parsed_url = urlparse(endpoint) + if parsed_url.scheme == "http": + return HTTPOTLPSpanExporter + elif parsed_url.scheme == "grpc": + return GRPCOTLPSpanExporter + else: + raise ValueError(f"Unsupported protocol: {parsed_url.scheme}") + def setup(app: FastAPI): logger = logging.getLogger(__name__) # Configure the OpenTelemetry SDK - service_name = os.environ.get("OTEL_SERVICE_NAME", os.environ.get("SERVICE_NAME", "keep-api")) - otlp_collector_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", os.environ.get("OTLP_ENDPOINT", False)) + service_name = os.environ.get( + "OTEL_SERVICE_NAME", os.environ.get("SERVICE_NAME", "keep-api") + ) + otlp_collector_endpoint = os.environ.get( + "OTEL_EXPORTER_OTLP_ENDPOINT", os.environ.get("OTLP_ENDPOINT", False) + ) otlp_traces_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", None) otlp_logs_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_LOGS_ENDPOINT", None) otlp_metrics_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", None) @@ -45,7 +51,7 @@ def setup(app: FastAPI): resource = Resource.create({"service.name": service_name}) provider = TracerProvider(resource=resource) - + if otlp_collector_endpoint: logger.info(f"OTLP endpoint set to {otlp_collector_endpoint}") @@ -53,13 +59,13 @@ def setup(app: FastAPI): if otlp_traces_endpoint: logger.info(f"OTLP Traces endpoint set to {otlp_traces_endpoint}") SpanExporter = get_protocol_from_endpoint(otlp_traces_endpoint) - processor = BatchSpanProcessor( - SpanExporter(endpoint=otlp_traces_endpoint) - ) + processor = BatchSpanProcessor(SpanExporter(endpoint=otlp_traces_endpoint)) provider.add_span_processor(processor) if metrics_enabled.lower() == "true" and otlp_metrics_endpoint: - logger.info(f"Metrics enabled. OTLP Metrics endpoint set to {otlp_metrics_endpoint}") + logger.info( + f"Metrics enabled. 
OTLP Metrics endpoint set to {otlp_metrics_endpoint}" + ) reader = PeriodicExportingMetricReader( OTLPMetricExporter(endpoint=otlp_metrics_endpoint) ) diff --git a/keep/api/routes/alerts.py b/keep/api/routes/alerts.py index dbc411a58..018211a70 100644 --- a/keep/api/routes/alerts.py +++ b/keep/api/routes/alerts.py @@ -1,17 +1,17 @@ -import asyncio import base64 +import concurrent.futures import hashlib import hmac import json import logging import os import time +from concurrent.futures import Future, ThreadPoolExecutor from typing import List, Optional import celpy from arq import ArqRedis from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request -from fastapi.concurrency import run_in_threadpool from fastapi.responses import JSONResponse from pusher import Pusher @@ -53,6 +53,12 @@ logger = logging.getLogger(__name__) REDIS = os.environ.get("REDIS", "false") == "true" +EVENT_WORKERS = int(config("KEEP_EVENT_WORKERS", default=50, cast=int)) + +# Create dedicated threadpool +process_event_executor = ThreadPoolExecutor( + max_workers=EVENT_WORKERS, thread_name_prefix="process_event_worker" +) @router.get( @@ -257,43 +263,52 @@ def assign_alert( return {"status": "ok"} -def discard_task( +def discard_future( trace_id: str, - task: asyncio.Task, + future: Future, running_tasks: set, started_time: float, ): try: - running_tasks.discard(task) - running_tasks_gauge.dec() # Decrease total counter - running_tasks_by_process_gauge.labels( - pid=os.getpid() - ).dec() # Decrease process counter - - # Log any exception that occurred in the task - if task.exception(): + running_tasks.discard(future) + running_tasks_gauge.dec() + running_tasks_by_process_gauge.labels(pid=os.getpid()).dec() + + # Log any exception that occurred in the future + try: + exception = future.exception() + if exception: + logger.error( + "Task failed with exception", + extra={ + "trace_id": trace_id, + "error": str(exception), + "processing_time": time.time() - started_time, + }, + ) + else: + logger.info( + "Task completed", + extra={ + "processing_time": time.time() - started_time, + "trace_id": trace_id, + }, + ) + except concurrent.futures.CancelledError: logger.error( - "Task failed with exception", + "Task was cancelled", extra={ "trace_id": trace_id, - "error": str(task.exception()), - "processing_time": time.time() - started_time, - }, - ) - else: - logger.info( - "Task completed", - extra={ "processing_time": time.time() - started_time, - "trace_id": trace_id, }, ) + except Exception: # Make sure we always decrement both counters even if something goes wrong running_tasks_gauge.dec() running_tasks_by_process_gauge.labels(pid=os.getpid()).dec() logger.exception( - "Error in discard_task callback", + "Error in discard_future callback", extra={ "trace_id": trace_id, }, @@ -317,26 +332,24 @@ def create_process_event_task( running_tasks_by_process_gauge.labels( pid=os.getpid() ).inc() # Increase process counter - task = asyncio.create_task( - run_in_threadpool( - process_event, - {}, - tenant_id, - provider_type, - provider_id, - fingerprint, - api_key_name, - trace_id, - event, - ) + future = process_event_executor.submit( + process_event, + {}, # ctx + tenant_id, + provider_type, + provider_id, + fingerprint, + api_key_name, + trace_id, + event, ) - task.add_done_callback( - lambda task: discard_task(trace_id, task, running_tasks, started_time) + running_tasks.add(future) + future.add_done_callback( + lambda task: discard_future(trace_id, task, running_tasks, started_time) ) - 
bg_tasks.add_task(task) - running_tasks.add(task) + logger.info("Task added", extra={"trace_id": trace_id}) - return task.get_name() + return str(id(future)) @router.post( diff --git a/keep/cli/cli.py b/keep/cli/cli.py index de2ddec83..02f3d4def 100644 --- a/keep/cli/cli.py +++ b/keep/cli/cli.py @@ -15,14 +15,9 @@ from dotenv import find_dotenv, load_dotenv from prettytable import PrettyTable -from keep.api.core.db_on_start import try_create_single_tenant -from keep.api.core.dependencies import SINGLE_TENANT_UUID from keep.api.core.posthog import posthog_client -from keep.cli.click_extensions import NotRequiredIf from keep.providers.models.provider_config import ProviderScope from keep.providers.providers_factory import ProvidersFactory -from keep.workflowmanager.workflowmanager import WorkflowManager -from keep.workflowmanager.workflowstore import WorkflowStore load_dotenv(find_dotenv()) @@ -349,120 +344,6 @@ def api(multi_tenant: bool, port: int, host: str): api.run(app) -@cli.command() -@click.option( - "--alerts-directory", - "--alerts-file", - "-af", - type=click.Path(exists=True, dir_okay=True, file_okay=True), - help="The path to the alert yaml/alerts directory", -) -@click.option( - "--alert-url", - "-au", - help="A url that can be used to download an alert yaml", - cls=NotRequiredIf, - multiple=True, - not_required_if="alerts_directory", -) -@click.option( - "--interval", - "-i", - type=int, - help="When interval is set, Keep will run the alert every INTERVAL seconds", - required=False, - default=0, -) -@click.option( - "--providers-file", - "-p", - type=click.Path(exists=False), - help="The path to the providers yaml", - required=False, - default="providers.yaml", -) -@click.option( - "--tenant-id", - "-t", - help="The tenant id", - required=False, - default=SINGLE_TENANT_UUID, -) -@click.option("--api-key", help="The API key for keep's API", required=False) -@click.option( - "--api-url", - help="The URL for keep's API", - required=False, - default="https://s.keephq.dev", -) -@pass_info -def run( - info: Info, - alerts_directory: str, - alert_url: list[str], - interval: int, - providers_file, - tenant_id, - api_key, - api_url, -): - """Run a workflow.""" - logger.debug(f"Running alert in {alerts_directory or alert_url}") - posthog_client.capture( - info.random_user_id, - "keep-run-alert-started", - properties={ - "args": sys.argv, - "keep_version": KEEP_VERSION, - }, - ) - # this should be fixed - workflow_manager = WorkflowManager.get_instance() - workflow_store = WorkflowStore() - if tenant_id == SINGLE_TENANT_UUID: - try_create_single_tenant(SINGLE_TENANT_UUID) - workflows = workflow_store.get_workflows_from_path( - tenant_id, alerts_directory or alert_url, providers_file - ) - try: - workflow_manager.run(workflows) - except KeyboardInterrupt: - logger.info("Keep stopped by user, stopping the scheduler") - posthog_client.capture( - info.random_user_id, - "keep-run-stopped-by-user", - properties={ - "args": sys.argv, - "keep_version": KEEP_VERSION, - }, - ) - workflow_manager.stop() - logger.info("Scheduler stopped") - except Exception as e: - posthog_client.capture( - info.random_user_id, - "keep-run-unexpected-error", - properties={ - "args": sys.argv, - "error": str(e), - "keep_version": KEEP_VERSION, - }, - ) - logger.error(f"Error running alert {alerts_directory or alert_url}: {e}") - if info.verbose: - raise e - sys.exit(1) - posthog_client.capture( - info.random_user_id, - "keep-run-alert-finished", - properties={ - "args": sys.argv, - "keep_version": KEEP_VERSION, - }, 
- ) - logger.debug(f"Alert in {alerts_directory or alert_url} ran successfully") - - @cli.group() @pass_info def workflow(info: Info): diff --git a/keep/workflowmanager/workflowmanager.py b/keep/workflowmanager/workflowmanager.py index 01d1f9f16..a8f7d0a07 100644 --- a/keep/workflowmanager/workflowmanager.py +++ b/keep/workflowmanager/workflowmanager.py @@ -10,11 +10,12 @@ get_previous_alert_by_fingerprint, save_workflow_results, ) +from keep.api.core.metrics import workflow_execution_duration from keep.api.models.alert import AlertDto, AlertSeverity, IncidentDto from keep.identitymanager.identitymanagerfactory import IdentityManagerTypes from keep.providers.providers_factory import ProviderConfigurationException from keep.workflowmanager.workflow import Workflow -from keep.workflowmanager.workflowscheduler import WorkflowScheduler +from keep.workflowmanager.workflowscheduler import WorkflowScheduler, timing_histogram from keep.workflowmanager.workflowstore import WorkflowStore @@ -33,6 +34,7 @@ def __init__(self): self.debug = config("WORKFLOW_MANAGER_DEBUG", default=False, cast=bool) if self.debug: self.logger.setLevel(logging.DEBUG) + self.scheduler = WorkflowScheduler(self) self.workflow_store = WorkflowStore() self.started = False @@ -42,13 +44,18 @@ async def start(self): if self.started: self.logger.info("Workflow manager already started") return + await self.scheduler.start() self.started = True def stop(self): """Stops the workflow manager""" + if not self.started: + return self.scheduler.stop() self.started = False + # Clear the scheduler reference + self.scheduler = None def _apply_filter(self, filter_val, value): # if it's a regex, apply it @@ -333,37 +340,6 @@ def _get_event_value(self, event, filter_key): else: return getattr(event, filter_key, None) - # TODO should be fixed to support the usual CLI - def run(self, workflows: list[Workflow]): - """ - Run list of workflows. - - Args: - workflow (str): Either an workflow yaml or a directory containing workflow yamls or a list of URLs to get the workflows from. - providers_file (str, optional): The path to the providers yaml. Defaults to None. - """ - self.logger.info("Running workflow(s)") - workflows_errors = [] - # If at least one workflow has an interval, run workflows using the scheduler, - # otherwise, just run it - if any([Workflow.workflow_interval for Workflow in workflows]): - # running workflows in scheduler mode - self.logger.info( - "Found at least one workflow with an interval, running in scheduler mode" - ) - self.scheduler_mode = True - # if the workflows doesn't have an interval, set the default interval - for workflow in workflows: - workflow.workflow_interval = workflow.workflow_interval - # This will halt until KeyboardInterrupt - self.scheduler.run_workflows(workflows) - self.logger.info("Workflow(s) scheduled") - else: - # running workflows in the regular mode - workflows_errors = self._run_workflows_from_cli(workflows) - - return workflows_errors - def _check_premium_providers(self, workflow: Workflow): """ Check if the workflow uses premium providers in multi tenant mode. 
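
The hunk below wraps `_run_workflow` in a Prometheus timing histogram via the `timing_histogram` decorator that this patch defines further down in `keep/workflowmanager/workflowscheduler.py`. As a standalone sketch of that pattern (illustrative names only, assuming `prometheus_client` is installed):

```python
import time
from functools import wraps

from prometheus_client import Histogram

demo_duration = Histogram(
    "demo_duration_seconds",
    "Time spent in decorated calls",
    labelnames=["workflow_id"],
)

def timed(histogram):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            start = time.time()
            try:
                return func(*args, **kwargs)
            finally:
                # Observe elapsed time even when the call raises.
                label = kwargs.get("workflow_id", "unknown")
                histogram.labels(workflow_id=label).observe(time.time() - start)
        return wrapper
    return decorator

@timed(demo_duration)
def run_once(workflow_id="wf-1"):
    time.sleep(0.05)

run_once(workflow_id="wf-1")  # records one observation in the "wf-1" series
```

The `finally` placement mirrors the real decorator: the duration is recorded whether the workflow succeeds or fails, so error runs still show up in the latency distribution.
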
@@ -428,6 +404,7 @@ def _run_workflow_on_failure( }, ) + @timing_histogram(workflow_execution_duration) def _run_workflow( self, workflow: Workflow, workflow_execution_id: str, test_run=False ): diff --git a/keep/workflowmanager/workflowscheduler.py b/keep/workflowmanager/workflowscheduler.py index 2fe05e1ed..f8c916781 100644 --- a/keep/workflowmanager/workflowscheduler.py +++ b/keep/workflowmanager/workflowscheduler.py @@ -2,10 +2,10 @@ import hashlib import logging import queue -import threading import time -import typing import uuid +from concurrent.futures import ThreadPoolExecutor +from functools import wraps from threading import Lock from sqlalchemy.exc import IntegrityError @@ -16,6 +16,13 @@ from keep.api.core.db import get_enrichment, get_previous_execution_id from keep.api.core.db import get_workflow as get_workflow_db from keep.api.core.db import get_workflows_that_should_run +from keep.api.core.metrics import ( + workflow_execution_errors_total, + workflow_execution_status, + workflow_executions_total, + workflow_queue_size, + workflows_running, +) from keep.api.models.alert import AlertDto, IncidentDto from keep.api.utils.email_utils import KEEP_EMAILS_ENABLED, EmailTemplates, send_email from keep.providers.providers_factory import ProviderConfigurationException @@ -23,6 +30,7 @@ from keep.workflowmanager.workflowstore import WorkflowStore READ_ONLY_MODE = config("KEEP_READ_ONLY", default="false") == "true" +MAX_WORKERS = config("WORKFLOWS_MAX_WORKERS", default="20") class WorkflowStatus(enum.Enum): @@ -31,12 +39,40 @@ class WorkflowStatus(enum.Enum): PROVIDERS_NOT_CONFIGURED = "providers_not_configured" +def timing_histogram(histogram): + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + start_time = time.time() + try: + result = func(*args, **kwargs) + return result + finally: + duration = time.time() - start_time + # Try to get tenant_id and workflow_id from self + try: + tenant_id = args[1].context_manager.tenant_id + except Exception: + tenant_id = "unknown" + try: + workflow_id = args[1].workflow_id + except Exception: + workflow_id = "unknown" + histogram.labels(tenant_id=tenant_id, workflow_id=workflow_id).observe( + duration + ) + + return wrapper + + return decorator + + class WorkflowScheduler: MAX_SIZE_SIGNED_INT = 2147483647 + MAX_WORKERS = config("KEEP_MAX_WORKFLOW_WORKERS", default="20", cast=int) def __init__(self, workflow_manager): self.logger = logging.getLogger(__name__) - self.threads = [] self.workflow_manager = workflow_manager self.workflow_store = WorkflowStore() # all workflows that needs to be run due to alert event @@ -46,14 +82,29 @@ def __init__(self, workflow_manager): self.interval_enabled = ( config("WORKFLOWS_INTERVAL_ENABLED", default="true") == "true" ) + self.executor = ThreadPoolExecutor( + max_workers=self.MAX_WORKERS, + thread_name_prefix="WorkflowScheduler", + ) + self.scheduler_future = None + self.futures = set() + # Initialize metrics for queue size + self._update_queue_metrics() + + def _update_queue_metrics(self): + """Update queue size metrics""" + with self.lock: + for workflow in self.workflows_to_run: + tenant_id = workflow.get("tenant_id", "unknown") + workflow_queue_size.labels(tenant_id=tenant_id).set( + len(self.workflows_to_run) + ) async def start(self): self.logger.info("Starting workflows scheduler") # Shahar: fix for a bug in unit tests self._stop = False - thread = threading.Thread(target=self._start) - thread.start() - self.threads.append(thread) + self.scheduler_future = 
self.executor.submit(self._start) self.logger.info("Workflows scheduler started") def _handle_interval_workflows(self): @@ -70,13 +121,12 @@ def _handle_interval_workflows(self): self.logger.exception("Error getting workflows that should run") pass for workflow in workflows: - self.logger.debug("Running workflow on background") - workflow_execution_id = workflow.get("workflow_execution_id") tenant_id = workflow.get("tenant_id") workflow_id = workflow.get("workflow_id") + try: - workflow = self.workflow_store.get_workflow(tenant_id, workflow_id) + workflow_obj = self.workflow_store.get_workflow(tenant_id, workflow_id) except ProviderConfigurationException: self.logger.exception( "Provider configuration is invalid", @@ -91,7 +141,7 @@ def _handle_interval_workflows(self): workflow_id=workflow_id, workflow_execution_id=workflow_execution_id, status=WorkflowStatus.PROVIDERS_NOT_CONFIGURED, - error=f"Providers are not configured for workflow {workflow_id}, please configure it so Keep will be able to run it", + error=f"Providers are not configured for workflow {workflow_id}", ) continue except Exception as e: @@ -104,12 +154,16 @@ def _handle_interval_workflows(self): error=f"Error getting workflow: {e}", ) continue - thread = threading.Thread( - target=self._run_workflow, - args=[tenant_id, workflow_id, workflow, workflow_execution_id], + + future = self.executor.submit( + self._run_workflow, + tenant_id, + workflow_id, + workflow_obj, + workflow_execution_id, ) - thread.start() - self.threads.append(thread) + self.futures.add(future) + future.add_done_callback(lambda f: self.futures.remove(f)) def _run_workflow( self, @@ -120,23 +174,52 @@ def _run_workflow( event_context=None, ): if READ_ONLY_MODE: - # This is because sometimes workflows takes 0 seconds and the executions chart is not updated properly. self.logger.debug("Sleeping for 3 seconds in favor of read only mode") time.sleep(3) + self.logger.info(f"Running workflow {workflow.workflow_id}...") try: + # Increment running workflows counter + workflows_running.labels(tenant_id=tenant_id).inc() + + # Track execution + # Shahar: currently incident doesn't have trigger so we will workaround it + if isinstance(event_context, AlertDto): + workflow_executions_total.labels( + tenant_id=tenant_id, + workflow_id=workflow_id, + trigger_type=event_context.trigger if event_context else "interval", + ).inc() + else: + # TODO: add trigger to incident + workflow_executions_total.labels( + tenant_id=tenant_id, + workflow_id=workflow_id, + trigger_type="incident", + ).inc() + + # Run the workflow if isinstance(event_context, AlertDto): - # set the event context, e.g. the event that triggered the workflow workflow.context_manager.set_event_context(event_context) else: - # set the incident context, e.g. 
the incident that triggered the workflow workflow.context_manager.set_incident_context(event_context) errors, _ = self.workflow_manager._run_workflow( workflow, workflow_execution_id ) except Exception as e: + # Track error metrics + workflow_execution_errors_total.labels( + tenant_id=tenant_id, + workflow_id=workflow_id, + error_type=type(e).__name__, + ).inc() + + workflow_execution_status.labels( + tenant_id=tenant_id, workflow_id=workflow_id, status="error" + ).inc() + self.logger.exception(f"Failed to run workflow {workflow.workflow_id}...") self._finish_workflow_execution( tenant_id=tenant_id, @@ -146,6 +229,10 @@ def _run_workflow( error=str(e), ) return + finally: + # Decrement running workflows counter + workflows_running.labels(tenant_id=tenant_id).dec() + self._update_queue_metrics() if any(errors): self.logger.info(msg=f"Workflow {workflow.workflow_id} ran with errors") @@ -164,10 +251,10 @@ def _run_workflow( status=WorkflowStatus.SUCCESS, error=None, ) + self.logger.info(f"Workflow {workflow.workflow_id} ran") def handle_workflow_test(self, workflow, tenant_id, triggered_by_user): - workflow_execution_id = self._get_unique_execution_number() self.logger.info( @@ -195,31 +282,35 @@ def run_workflow_wrapper( print(f"Exception in workflow: {e}") result_queue.put((str(e), None)) - thread = threading.Thread( - target=run_workflow_wrapper, - args=[ - self.workflow_manager._run_workflow, - workflow, - workflow_execution_id, - True, - result_queue, - ], + future = self.executor.submit( + run_workflow_wrapper, + self.workflow_manager._run_workflow, + workflow, + workflow_execution_id, + True, + result_queue, ) - thread.start() - thread.join() + future.result() # Wait for completion errors, results = result_queue.get() - self.logger.info( - f"Workflow {workflow.workflow_id} ran", - extra={"errors": errors, "results": results}, - ) - status = "success" error = None if any(errors): error = "\n".join(str(e) for e in errors) status = "error" + self.logger.info( + "Workflow test complete", + extra={ + "workflow_id": workflow.workflow_id, + "workflow_execution_id": workflow_execution_id, + "tenant_id": tenant_id, + "status": status, + "error": error, + "results": results, + }, + ) + return { "workflow_execution_id": workflow_execution_id, "status": status, @@ -320,6 +411,10 @@ def _handle_event_workflows(self): workflow = workflow_to_run.get("workflow") workflow_id = workflow_to_run.get("workflow_id") tenant_id = workflow_to_run.get("tenant_id") + # Update queue size metrics + workflow_queue_size.labels(tenant_id=tenant_id).set( + len(self.workflows_to_run) + ) workflow_execution_id = workflow_to_run.get("workflow_execution_id") if not workflow: self.logger.info("Loading workflow") @@ -499,18 +594,30 @@ def _handle_event_workflows(self): ) continue # Last, run the workflow - thread = threading.Thread( - target=self._run_workflow, - args=[tenant_id, workflow_id, workflow, workflow_execution_id, event], + future = self.executor.submit( + self._run_workflow, + tenant_id, + workflow_id, + workflow, + workflow_execution_id, + event, ) - thread.start() - self.threads.append(thread) + self.futures.add(future) + future.add_done_callback(lambda f: self.futures.remove(f)) + + self.logger.info( + "Event workflows handled", + extra={"current_number_of_workflows": len(self.futures)}, + ) def _start(self): self.logger.info("Starting workflows scheduler") while not self._stop: # get all workflows that should run now - self.logger.debug("Getting workflows that should run...") + self.logger.info( + "Starting 
workflow scheduler iteration", + extra={"current_number_of_workflows": len(self.futures)}, + ) try: self._handle_interval_workflows() self._handle_event_workflows() @@ -523,55 +630,41 @@ def _start(self): time.sleep(1) self.logger.info("Workflows scheduler stopped") - def run_workflows(self, workflows: typing.List[Workflow]): - for workflow in workflows: - thread = threading.Thread( - target=self._run_workflows_with_interval, - args=[workflow], - daemon=True, - ) - thread.start() - self.threads.append(thread) - # as long as the stop flag is not set, sleep - while not self._stop: - time.sleep(1) - def stop(self): self.logger.info("Stopping scheduled workflows") self._stop = True - # Now wait for the threads to finish - for thread in self.threads: - thread.join() - self.logger.info("Scheduled workflows stopped") - def _run_workflows_with_interval( - self, - workflow: Workflow, - ): - """Simple scheduling of workflows with interval + # Wait for scheduler to stop first + if self.scheduler_future: + try: + self.scheduler_future.result( + timeout=5 + ) # Add timeout to prevent hanging + except Exception: + self.logger.exception("Error waiting for scheduler to stop") - TODO: Use https://github.com/agronholm/apscheduler + # Cancel all running workflows with timeout + for future in list(self.futures): # Create a copy of futures set + try: + self.logger.info("Cancelling future") + future.cancel() + future.result(timeout=1) # Add timeout + self.logger.info("Future cancelled") + except Exception: + self.logger.exception("Error cancelling future") - Args: - workflow (Workflow): The workflow to run. - """ - while True and not self._stop: - self.logger.info(f"Running workflow {workflow.workflow_id}...") + # Shutdown the executor with timeout + if self.executor: try: - self.workflow_manager._run_workflow(workflow, uuid.uuid4()) + self.logger.info("Shutting down executor") + self.executor.shutdown(wait=True, cancel_futures=True) + self.executor = None + self.logger.info("Executor shut down") except Exception: - self.logger.exception( - f"Failed to run workflow {workflow.workflow_id}..." - ) - self.logger.info(f"Workflow {workflow.workflow_id} ran") - if workflow.workflow_interval > 0: - self.logger.info( - f"Sleeping for {workflow.workflow_interval} seconds..." 
- ) - time.sleep(workflow.workflow_interval) - else: - self.logger.info("Workflow will not run again") - break + self.logger.exception("Error shutting down executor") + + self.futures.clear() + self.logger.info("Scheduled workflows stopped") def _finish_workflow_execution( self, diff --git a/poetry.lock b/poetry.lock index 79d159014..56d053477 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1338,22 +1338,23 @@ testing = ["hatch", "pre-commit", "pytest", "tox"] [[package]] name = "fastapi" -version = "0.109.2" +version = "0.115.6" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = false python-versions = ">=3.8" files = [ - {file = "fastapi-0.109.2-py3-none-any.whl", hash = "sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d"}, - {file = "fastapi-0.109.2.tar.gz", hash = "sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73"}, + {file = "fastapi-0.115.6-py3-none-any.whl", hash = "sha256:e9240b29e36fa8f4bb7290316988e90c381e5092e0cbe84e7818cc3713bcf305"}, + {file = "fastapi-0.115.6.tar.gz", hash = "sha256:9ec46f7addc14ea472958a96aae5b5de65f39721a46aaf5705c480d9a8b76654"}, ] [package.dependencies] pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0" -starlette = ">=0.36.3,<0.37.0" +starlette = ">=0.40.0,<0.42.0" typing-extensions = ">=4.8.0" [package.extras] -all = ["email-validator (>=2.0.0)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.7)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] +all = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.7)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] +standard = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "jinja2 (>=2.11.2)", "python-multipart (>=0.0.7)", "uvicorn[standard] (>=0.12.0)"] [[package]] name = "filelock" @@ -2738,7 +2739,7 @@ name = "ndg-httpsclient" version = "0.5.1" description = "Provides enhanced HTTPS support for httplib and urllib2 using PyOpenSSL" optional = false -python-versions = ">=2.7,<3.0.dev0 || >=3.4.dev0" +python-versions = ">=2.7,<3.0.0 || >=3.4.0" files = [ {file = "ndg_httpsclient-0.5.1-py2-none-any.whl", hash = "sha256:d2c7225f6a1c6cf698af4ebc962da70178a99bcde24ee6d1961c4f3338130d57"}, {file = "ndg_httpsclient-0.5.1-py3-none-any.whl", hash = "sha256:dd174c11d971b6244a891f7be2b32ca9853d3797a72edb34fa5d7b07d8fff7d4"}, @@ -4981,13 +4982,13 @@ files = [ [[package]] name = "starlette" -version = "0.36.3" +version = "0.41.3" description = "The little ASGI library that shines." 
optional = false python-versions = ">=3.8" files = [ - {file = "starlette-0.36.3-py3-none-any.whl", hash = "sha256:13d429aa93a61dc40bf503e8c801db1f1bca3dc706b10ef2434a36123568f044"}, - {file = "starlette-0.36.3.tar.gz", hash = "sha256:90a671733cfb35771d8cc605e0b679d23b992f8dcfad48cc60b38cb29aeb7080"}, + {file = "starlette-0.41.3-py3-none-any.whl", hash = "sha256:44cedb2b7c77a9de33a8b74b2b90e9f50d11fcf25d8270ea525ad71a25374ff7"}, + {file = "starlette-0.41.3.tar.gz", hash = "sha256:0e4ab3d16522a255be6b28260b938eae2482f98ce5cc934cb08dce8dc3ba5835"}, ] [package.dependencies] @@ -5463,4 +5464,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.11,<3.12" -content-hash = "d1ecb84ec2278190d29b2131ef67b077971af74f076c0b4055c475073f36ad10" +content-hash = "089d3d061da28029a73cbe1b566b3d6ef531145407b322934821b1003ff9681d" diff --git a/pyproject.toml b/pyproject.toml index 0fef9c044..c69b12a7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "keep" -version = "0.33.0" +version = "0.33.1" description = "Alerting. for developers, by developers." authors = ["Keep Alerting LTD"] packages = [{include = "keep"}] @@ -24,7 +24,7 @@ python-json-logger = "^2.0.6" boto3 = "^1.26.72" validators = "^0.20.0" python-telegram-bot = "^20.1" -fastapi = "^0.109.1" +fastapi = "^0.115.6" uvicorn = "0.32.1" opsgenie-sdk = "^2.1.5" starlette-context = "^0.3.6" diff --git a/tests/test_workflow_execution.py b/tests/test_workflow_execution.py index c9ae1cc53..0272f30cd 100644 --- a/tests/test_workflow_execution.py +++ b/tests/test_workflow_execution.py @@ -79,15 +79,26 @@ @pytest.fixture(scope="module") def workflow_manager(): """ - Fixture to create and manage a WorkflowManager instance for the duration of the module. - It starts the manager asynchronously and stops it after all tests are completed. + Fixture to create and manage a WorkflowManager instance. 
""" - manager = WorkflowManager.get_instance() - asyncio.run(manager.start()) - while not manager.started: - time.sleep(0.1) - yield manager - manager.stop() + manager = None + try: + from keep.workflowmanager.workflowscheduler import WorkflowScheduler + + scheduler = WorkflowScheduler(None) + manager = WorkflowManager.get_instance() + scheduler.workflow_manager = manager + manager.scheduler = scheduler + asyncio.run(manager.start()) + yield manager + finally: + if manager: + try: + manager.stop() + # Give some time for threads to clean up + time.sleep(1) + except Exception as e: + print(f"Error stopping workflow manager: {e}") @pytest.fixture @@ -690,7 +701,11 @@ def test_workflow_execution_with_disabled_workflow( count = 0 while ( - enabled_workflow_execution is None and disabled_workflow_execution is None + ( + enabled_workflow_execution is None + or enabled_workflow_execution.status == "in_progress" + ) + and disabled_workflow_execution is None ) and count < 30: enabled_workflow_execution = get_last_workflow_execution_by_workflow_id( SINGLE_TENANT_UUID, enabled_id diff --git a/tests/test_workflowmanager.py b/tests/test_workflowmanager.py index 2b4868146..eb43abe15 100644 --- a/tests/test_workflowmanager.py +++ b/tests/test_workflowmanager.py @@ -1,16 +1,17 @@ -import pytest +import queue +from pathlib import Path from unittest.mock import Mock, patch + +import pytest from fastapi import HTTPException -import threading -import queue + +from keep.parser.parser import Parser # Assuming WorkflowParser is the class containing the get_workflow_from_dict method from keep.workflowmanager.workflow import Workflow -from keep.workflowmanager.workflowstore import WorkflowStore from keep.workflowmanager.workflowmanager import WorkflowManager from keep.workflowmanager.workflowscheduler import WorkflowScheduler -from keep.parser.parser import Parser -from pathlib import Path +from keep.workflowmanager.workflowstore import WorkflowStore path_to_test_resources = Path(__file__).parent / "workflows" @@ -109,29 +110,27 @@ def test_handle_workflow_test(): tenant_id = "test_tenant" triggered_by_user = "test_user" - with patch.object(threading, "Thread", wraps=threading.Thread) as mock_thread: - with patch.object(queue, "Queue", wraps=queue.Queue) as mock_queue: - result = workflow_scheduler.handle_workflow_test( - workflow=mock_workflow, - tenant_id=tenant_id, - triggered_by_user=triggered_by_user, - ) + with patch.object(queue, "Queue", wraps=queue.Queue) as mock_queue: + result = workflow_scheduler.handle_workflow_test( + workflow=mock_workflow, + tenant_id=tenant_id, + triggered_by_user=triggered_by_user, + ) - mock_workflow_manager._run_workflow.assert_called_once_with( - mock_workflow, 123, True - ) + mock_workflow_manager._run_workflow.assert_called_once_with( + mock_workflow, 123, True + ) - assert mock_thread.call_count == 1 - assert mock_queue.call_count == 1 + assert mock_queue.call_count == 1 - expected_result = { - "workflow_execution_id": 123, - "status": "success", - "error": None, - "results": {"result": "value1"}, - } - assert result == expected_result - assert mock_logger.info.call_count == 2 + expected_result = { + "workflow_execution_id": 123, + "status": "success", + "error": None, + "results": {"result": "value1"}, + } + assert result == expected_result + assert mock_logger.info.call_count == 2 def test_handle_workflow_test_with_error(): @@ -152,26 +151,24 @@ def test_handle_workflow_test_with_error(): tenant_id = "test_tenant" triggered_by_user = "test_user" - with 
patch.object(threading, "Thread", wraps=threading.Thread) as mock_thread: - with patch.object(queue, "Queue", wraps=queue.Queue) as mock_queue: - result = workflow_scheduler.handle_workflow_test( - workflow=mock_workflow, - tenant_id=tenant_id, - triggered_by_user=triggered_by_user, - ) - - mock_workflow_manager._run_workflow.assert_called_once_with( - mock_workflow, 123, True - ) - - assert mock_thread.call_count == 1 - assert mock_queue.call_count == 1 - - expected_result = { - "workflow_execution_id": 123, - "status": "error", - "error": "Error occurred", - "results": None, - } - assert result == expected_result - assert mock_logger.info.call_count == 2 + with patch.object(queue, "Queue", wraps=queue.Queue) as mock_queue: + result = workflow_scheduler.handle_workflow_test( + workflow=mock_workflow, + tenant_id=tenant_id, + triggered_by_user=triggered_by_user, + ) + + mock_workflow_manager._run_workflow.assert_called_once_with( + mock_workflow, 123, True + ) + + assert mock_queue.call_count == 1 + + expected_result = { + "workflow_execution_id": 123, + "status": "error", + "error": "Error occurred", + "results": None, + } + assert result == expected_result + assert mock_logger.info.call_count == 2 From 3c3cb8329e5729120f475f5f2874030dc4483587 Mon Sep 17 00:00:00 2001 From: Tal Date: Sat, 28 Dec 2024 13:35:47 +0200 Subject: [PATCH 07/21] feat: Grafana + Prometheus with auto-provisioning of Keep dashboard (#2914) Signed-off-by: Shahar Glazner Signed-off-by: Tal Co-authored-by: shahargl --- .gitignore | 3 + STRESS.md | 58 -- docker-compose.yml | 30 + grafana/dashboards/keep.json | 737 ++++++++++++++++++ grafana/provisioning/dashboards/keep.yml | 11 + .../provisioning/datasources/prometheus.yml | 9 + keep/api/api.py | 4 +- .../cilium_provider/cilium_provider.py | 14 +- keep/providers/providers_factory.py | 4 +- prometheus/prometheus.yml | 13 + 10 files changed, 818 insertions(+), 65 deletions(-) delete mode 100644 STRESS.md create mode 100644 grafana/dashboards/keep.json create mode 100644 grafana/provisioning/dashboards/keep.yml create mode 100644 grafana/provisioning/datasources/prometheus.yml create mode 100644 prometheus/prometheus.yml diff --git a/.gitignore b/.gitignore index 38d405e05..8b8a8989c 100644 --- a/.gitignore +++ b/.gitignore @@ -216,3 +216,6 @@ keepnew.db providers_cache.json tests/provision/* +grafana/* +!grafana/provisioning/ +!grafana/dashboards/ \ No newline at end of file diff --git a/STRESS.md b/STRESS.md deleted file mode 100644 index dd248cc22..000000000 --- a/STRESS.md +++ /dev/null @@ -1,58 +0,0 @@ - -# UNDER CONSTRUCTION - -# First, create a Kubernetes cluster - - -# Install Keep -gcloud config set project keep-dev-429814 -gcloud container clusters get-credentials keep-stress --zone us-central1-c --project keep-dev-429814 -helm repo add keephq https://keephq.github.io/helm-charts -helm pull keephq/keep -# create the namespace -kubectl create namespace keep -# install keep -helm install keep keephq/keep --namespace keep -# from local -helm install keep ./charts/keep --namespace keep - -kubectl -n keep describe pod keep-backend-697f6b946f-v2jxp -kubectl -n keep logs keep-frontend-577fdf5497-r8ht9 -# Import alerts - -# uninstall -helm uninstall keep --namespace keep - -kubectl -n keep exec -it keep-backend-64c4d7ddb7-7p5q5 /bin/bash -# copy the db -kubectl -n keep exec -it keep-database-86dd6b6775-92sz4 /bin/bash -kubectl -n keep cp ./keep.sql keep-database-659c69689-vxhkz:/tmp/keep.sql -kubectl -n keep exec -it keep-database-659c69689-vxhkz -- bash -c "mysql -u 
root keep < /tmp/keep.sql" -# exec into the pod -kubectl -n keep exec -it keep-database-86dd6b6775-92sz4 -- /bin/bash -# import -kubectl -n keep exec -it keep-database-659c69689-vxhkz -- bash -c "mysql -u root keep < /tmp/keep.sql" - -# No Load -## 500k alerts - 1Gi/250m cpu: get_last_alerts 2 minutes and 30 seconds -Keep Backend Workers get a timeout after one minute (status code 500 for preset and alert endpoints) -## 500k alerts - 2Gi/500m cpu: -- default mysql: get_last_alerts 1 minutes and 30 seconds -- innodb_buffer_pool_size = 4294967296: 25 seconds, 3 seconds after cache -## 500k alerts - 4Gi/1 cpu: get_last_alerts 2 minutes and 30 seconds -- -## 500k alerts - 8Gi/1 cpu: get_last_alerts 2 minutes and 30 seconds - -# Load 10 alerts per minute - -# Load 100 alerts per minute - -# Load 1000 alerts per minute - - -## 1M alerts -# Load 10 alerts per minute - -# Load 100 alerts per minute - -# Load 1000 alerts per minute diff --git a/docker-compose.yml b/docker-compose.yml index 68291e6b6..14b6001ad 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,3 +26,33 @@ services: extends: file: docker-compose.common.yml service: keep-websocket-server-common + + grafana: + image: grafana/grafana:latest + profiles: + - grafana + ports: + - "3001:3000" + volumes: + - ./grafana:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning + - ./grafana/dashboards:/etc/grafana/dashboards + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + depends_on: + - prometheus + + prometheus: + image: prom/prometheus:latest + profiles: + - grafana + ports: + - "9090:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + command: + - "--config.file=/etc/prometheus/prometheus.yml" + depends_on: + - keep-backend diff --git a/grafana/dashboards/keep.json b/grafana/dashboards/keep.json new file mode 100644 index 000000000..a94725b1f --- /dev/null +++ b/grafana/dashboards/keep.json @@ -0,0 +1,737 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "title": "Request Duration by Endpoint", + "type": "timeseries", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(keep_http_request_duration_seconds_sum{handler!=\"none\"}[5m]) / 
rate(keep_http_request_duration_seconds_count{handler!=\"none\"}[5m])", + "legendFormat": "{{handler}}" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(keep_running_tasks_current)", + "refId": "A" + } + ], + "title": "Running Tasks", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(keep_http_requests_total{status=~\"2..\"}[5m])) by (handler)", + "legendFormat": "{{handler}}", + "refId": "A" + } + ], + "title": "Request Rate by Endpoint (2xx)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "expr": "rate(keep_events_in_total[5m])", + "legendFormat": "Events In", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(keep_events_processed_total[5m])", + "legendFormat": "Events Processed", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(keep_events_error_total[5m])", + "legendFormat": "Events Error", + "refId": "C" + } + ], + "title": "Events Processing Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 5, + "title": "Workflow Execution Duration", + "type": "timeseries", + "targets": [ + { + "expr": "rate(keep_workflows_execution_duration_seconds_sum[5m]) / rate(keep_workflows_execution_duration_seconds_count[5m])", + "legendFormat": "{{workflow_id}}" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 6, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "title": "Workflow Queue Size", + "type": "gauge", + "targets": [ + { + "expr": "keep_workflows_queue_size", + "legendFormat": "{{tenant_id}}" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 7, + "title": "Workflow Executions", + "type": "timeseries", + "targets": [ + { + "expr": "rate(keep_workflows_executions_total[5m])", + "legendFormat": "{{workflow_id}} ({{trigger_type}})" + } + ] + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "keep_events_in_total", + "legendFormat": "Total Events In", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "keep_events_processed_total", + "legendFormat": "Total Events Processed", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "keep_events_error_total", + "legendFormat": "Total Events Error", + "refId": "C" + } + ], + "title": "Total Events", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 9, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "keep_workflows_executions_total", + "legendFormat": "{{workflow_id}} ({{trigger_type}})", + "refId": "A" + } + ], + "title": "Total Workflow Executions", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["keep"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Keep Dashboard", + "uid": "keep", + "version": 1, + "weekStart": "" +} diff --git a/grafana/provisioning/dashboards/keep.yml b/grafana/provisioning/dashboards/keep.yml new file mode 100644 index 000000000..6213d6185 --- /dev/null +++ 
b/grafana/provisioning/dashboards/keep.yml
@@ -0,0 +1,11 @@
+apiVersion: 1
+
+providers:
+  - name: "Keep"
+    orgId: 1
+    folder: ""
+    type: file
+    disableDeletion: false
+    editable: true
+    options:
+      path: /etc/grafana/dashboards
diff --git a/grafana/provisioning/datasources/prometheus.yml b/grafana/provisioning/datasources/prometheus.yml
new file mode 100644
index 000000000..a221c3c37
--- /dev/null
+++ b/grafana/provisioning/datasources/prometheus.yml
@@ -0,0 +1,9 @@
+apiVersion: 1
+
+datasources:
+  - name: prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: true
diff --git a/keep/api/api.py b/keep/api/api.py
index 7be28833a..ae1a44ac9 100644
--- a/keep/api/api.py
+++ b/keep/api/api.py
@@ -320,7 +320,9 @@ async def catch_exception(request: Request, exc: Exception):
     app.add_middleware(LoggingMiddleware)

     if config("KEEP_METRICS", default="true", cast=bool):
-        Instrumentator().instrument(app=app, metric_namespace="keep")
+        Instrumentator(
+            excluded_handlers=["/metrics", "/metrics/processing"]
+        ).instrument(app=app, metric_namespace="keep")
     keep.api.observability.setup(app)

     return app
diff --git a/keep/providers/cilium_provider/cilium_provider.py b/keep/providers/cilium_provider/cilium_provider.py
index e72d1c238..400133827 100644
--- a/keep/providers/cilium_provider/cilium_provider.py
+++ b/keep/providers/cilium_provider/cilium_provider.py
@@ -7,9 +7,6 @@
 from keep.api.models.db.topology import TopologyServiceInDto
 from keep.contextmanager.contextmanager import ContextManager
 from keep.providers.base.base_provider import BaseTopologyProvider
-from keep.providers.cilium_provider.grpc.observer_pb2 import (FlowFilter,
-                                                              GetFlowsRequest)
-from keep.providers.cilium_provider.grpc.observer_pb2_grpc import ObserverStub
 from keep.providers.models.provider_config import ProviderConfig
 from keep.validation.fields import NoSchemeUrl

@@ -24,7 +21,7 @@ class CiliumProviderAuthConfig:
             "description": "The base endpoint of the cilium hubble relay",
             "sensitive": False,
             "hint": "localhost:4245",
-            "validation": "no_scheme_url"
+            "validation": "no_scheme_url",
         }
     )

@@ -82,6 +79,15 @@ def _get_service_name(self, endpoint) -> str:
         return service

     def pull_topology(self) -> list[TopologyServiceInDto]:
+        # some providers, like the cilium provider, depend on grpc; importing it at module level might fail when not running from Keep (such as the docs script)
+        from keep.providers.cilium_provider.grpc.observer_pb2 import (  # noqa
+            FlowFilter,
+            GetFlowsRequest,
+        )
+        from keep.providers.cilium_provider.grpc.observer_pb2_grpc import (  # noqa
+            ObserverStub,
+        )
+
         channel = grpc.insecure_channel(self.authentication_config.cilium_base_endpoint)
         stub = ObserverStub(channel)

diff --git a/keep/providers/providers_factory.py b/keep/providers/providers_factory.py
index 3a2ec4131..2cea90713 100644
--- a/keep/providers/providers_factory.py
+++ b/keep/providers/providers_factory.py
@@ -414,9 +414,9 @@ def get_all_providers(ignore_cache_file: bool = False) -> list[Provider]:
                 )
                 continue
             # some providers that depend on grpc, like the cilium provider, might fail to import when not running from Keep (such as the docs script)
-            except TypeError:
+            except TypeError as e:
                 logger.warning(
-                    f"Cannot import provider {provider_directory}, unexpected error."
+                    f"Cannot import provider {provider_directory}, unexpected error. ({str(e)})"
                 )
                 continue

diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml
new file mode 100644
index 000000000..c3a5987a0
--- /dev/null
+++ b/prometheus/prometheus.yml
@@ -0,0 +1,13 @@
+global:
+  scrape_interval: 5s
+  evaluation_interval: 5s
+
+scrape_configs:
+  - job_name: "keep"
+    static_configs:
+      - targets: ["keep-backend:8080"]
+    metrics_path: "/metrics/processing"
+    http_headers:
+      x-api-key:
+        values:
+          - "keep-api-key"
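A note on the `excluded_handlers` change in `keep/api/api.py` above: prometheus-fastapi-instrumentator treats each entry as a pattern matched against the handler path, so excluding `/metrics` and `/metrics/processing` keeps Prometheus's own scrapes out of the `keep_http_*` series the dashboard charts. A minimal standalone sketch of the same pattern (assuming only that the library is installed; the bare app and `expose()` endpoint are illustrative, not Keep's wiring):

    # Sketch: instrument a FastAPI app, but skip the metrics endpoints themselves
    # so that scraping /metrics does not inflate the request-duration histograms.
    from fastapi import FastAPI
    from prometheus_fastapi_instrumentator import Instrumentator

    app = FastAPI()

    Instrumentator(
        excluded_handlers=["/metrics", "/metrics/processing"],
    ).instrument(app, metric_namespace="keep").expose(app, endpoint="/metrics")
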
From edcabafa4c85c8c5dc9330cc5287e62cfb6c0274 Mon Sep 17 00:00:00 2001
From: Shahar Glazner
Date: Sun, 29 Dec 2024 10:52:25 +0200
Subject: [PATCH 08/21] fix(ui): keycloak (#2921)

---
 keep-ui/auth.config.ts | 4 ++++
 pyproject.toml         | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/keep-ui/auth.config.ts b/keep-ui/auth.config.ts
index 5ec968059..52d632237 100644
--- a/keep-ui/auth.config.ts
+++ b/keep-ui/auth.config.ts
@@ -225,6 +225,10 @@ export const config = {
         if ((profile as any)?.keep_role) {
           role = (profile as any).keep_role;
         }
+      } else if (authType === AuthType.KEYCLOAK) {
+        // TODO: remove this once we have a proper way to get the tenant id
+        tenantId = (profile as any).keep_tenant_id || "keep";
+        role = (profile as any).keep_role;
       } else {
         accessToken =
           user.accessToken || account.access_token || account.id_token;
diff --git a/pyproject.toml b/pyproject.toml
index c69b12a7e..387b6bb84 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "keep"
-version = "0.33.1"
+version = "0.33.2"
 description = "Alerting. for developers, by developers."
 authors = ["Keep Alerting LTD"]
 packages = [{include = "keep"}]

From 3e3a0e64de306f85587efeda04bc49f15f6e1a65 Mon Sep 17 00:00:00 2001
From: Tal
Date: Sun, 29 Dec 2024 13:02:19 +0200
Subject: [PATCH 09/21] fix(mapping): when using AND in matcher (#2919)

---
 keep/api/bl/enrichments_bl.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/keep/api/bl/enrichments_bl.py b/keep/api/bl/enrichments_bl.py
index 08ab95855..3342cbc03 100644
--- a/keep/api/bl/enrichments_bl.py
+++ b/keep/api/bl/enrichments_bl.py
@@ -44,7 +44,11 @@ def get_nested_attribute(obj: AlertDto, attr_path: str):
         # We can access it by using "results.some@@attribute" so we won't think it's a nested attribute
         if attr is not None and "@@" in attr:
             attr = attr.replace("@@", ".")
-        obj = getattr(obj, attr, obj.get(attr, None) if isinstance(obj, dict) else None)
+        obj = getattr(
+            obj,
+            attr,
+            obj.get(attr, None) if isinstance(obj, dict) else None,
+        )
     if obj is None:
         return None
     return obj
@@ -315,6 +319,8 @@ def _check_alert_matches_rule(self, alert: AlertDto, rule: MappingRule) -> bool:
                     is_matcher = True
                     break
             if not is_matcher:
+                # If the key has a . (dot) in it, it'll be added as-is even though it should be nested.
+                # @tb: fix this when somebody complains about it.
                 enrichments[key] = value
                 break

@@ -370,15 +376,17 @@ def _check_matcher(self, alert: AlertDto, row: dict, matcher: str) -> bool:
         - bool: True if alert matches the matcher, False otherwise.
         """
         try:
-            if " && " in matcher:
-                # Split by " && " for AND condition
-                conditions = matcher.split(" && ")
+            if "&&" in matcher:
+                # Split by "&&" for AND condition
+                conditions = matcher.split("&&")
                 return all(
                     self._is_match(
-                        get_nested_attribute(alert, attribute), row.get(attribute)
+                        get_nested_attribute(alert, attribute.strip()),
+                        row.get(attribute.strip()),
                     )
-                    or get_nested_attribute(alert, attribute) == row.get(attribute)
-                    or row.get(attribute) == "*"  # Wildcard match
+                    or get_nested_attribute(alert, attribute.strip())
+                    == row.get(attribute.strip())
+                    or row.get(attribute.strip()) == "*"  # Wildcard match
                     for attribute in conditions
                 )
             else:
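To make the matcher change concrete: `_check_matcher` now splits on `&&` with or without surrounding spaces, and strips each attribute before the lookup, so `"service && region"` and `"service&&region"` behave identically. A self-contained sketch of the semantics (the `check_matcher` below is a simplified stand-in, not Keep's real implementation; plain dicts replace `AlertDto` and `_is_match`):

    # Simplified model of the AND-matcher after this patch: split on "&&",
    # strip whitespace, and match each attribute, with "*" as a wildcard.
    def check_matcher(alert: dict, row: dict, matcher: str) -> bool:
        conditions = matcher.split("&&") if "&&" in matcher else [matcher]
        return all(
            row.get(attr.strip()) == "*"  # wildcard row value matches anything
            or alert.get(attr.strip()) == row.get(attr.strip())
            for attr in conditions
        )

    alert = {"service": "db", "region": "eu-west-1"}
    row = {"service": "db", "region": "*"}
    # Both spellings now evaluate the same way:
    assert check_matcher(alert, row, "service && region")
    assert check_matcher(alert, row, "service&&region")
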
""" try: - if " && " in matcher: - # Split by " && " for AND condition - conditions = matcher.split(" && ") + if "&&" in matcher: + # Split by "&&" for AND condition + conditions = matcher.split("&&") return all( self._is_match( - get_nested_attribute(alert, attribute), row.get(attribute) + get_nested_attribute(alert, attribute.strip()), + row.get(attribute.strip()), ) - or get_nested_attribute(alert, attribute) == row.get(attribute) - or row.get(attribute) == "*" # Wildcard match + or get_nested_attribute(alert, attribute.strip()) + == row.get(attribute.strip()) + or row.get(attribute.strip()) == "*" # Wildcard match for attribute in conditions ) else: From a04c6b10b57dfc28654df5f181ae7b6dbcbb2989 Mon Sep 17 00:00:00 2001 From: Kirill Chernakov Date: Sun, 29 Dec 2024 18:54:01 +0400 Subject: [PATCH 10/21] fix: redirect on 401 (#2923) --- keep-ui/app/(keep)/providers/page.client.tsx | 20 +- keep-ui/app/(signin)/layout.tsx | 32 ++- keep-ui/app/(signin)/signin/SignInForm.tsx | 194 +++++++----------- .../incident-list/ui/incident-list-error.tsx | 6 +- keep-ui/{middleware.tsx => middleware.ts} | 44 ++-- keep-ui/shared/api/ApiClient.ts | 13 +- .../api/server/createServerApiClient.ts | 3 +- keep-ui/shared/lib/hooks/useApi.tsx | 2 +- tests/e2e_tests/test_end_to_end.py | 4 +- 9 files changed, 165 insertions(+), 153 deletions(-) rename keep-ui/{middleware.tsx => middleware.ts} (66%) diff --git a/keep-ui/app/(keep)/providers/page.client.tsx b/keep-ui/app/(keep)/providers/page.client.tsx index 99a1b7951..ede753034 100644 --- a/keep-ui/app/(keep)/providers/page.client.tsx +++ b/keep-ui/app/(keep)/providers/page.client.tsx @@ -7,6 +7,10 @@ import { useFilterContext } from "./filter-context"; import { toast } from "react-toastify"; import { useProviders } from "@/utils/hooks/useProviders"; import { showErrorToast } from "@/shared/ui"; +import { Link } from "@/components/ui"; + +const EXTERNAL_URL_DOCS_URL = + "https://docs.keephq.dev/development/external-url"; export const useFetchProviders = () => { const [providers, setProviders] = useState([]); @@ -25,8 +29,14 @@ export const useFetchProviders = () => {
Webhooks are disabled because Keep is not accessible from the internet.
-
- Click for Keep docs on how to enabled it 📚 + + Read docs + {" "} + to learn how to enable it.
); @@ -38,11 +48,7 @@ export const useFetchProviders = () => { type: "info", position: toast.POSITION.TOP_CENTER, autoClose: 10000, - onClick: () => - window.open( - "https://docs.keephq.dev/development/external-url", - "_blank" - ), + onClick: () => window.open(EXTERNAL_URL_DOCS_URL, "_blank"), style: { width: "250%", // Set width marginLeft: "-75%", // Adjust starting position to left diff --git a/keep-ui/app/(signin)/layout.tsx b/keep-ui/app/(signin)/layout.tsx index c535b8c48..75a3a11ce 100644 --- a/keep-ui/app/(signin)/layout.tsx +++ b/keep-ui/app/(signin)/layout.tsx @@ -1,3 +1,6 @@ +import { Card, Text } from "@tremor/react"; +import Image from "next/image"; + export const metadata = { title: "Keep", description: "The open-source alert management and AIOps platform", @@ -9,8 +12,33 @@ export default function RootLayout({ children: React.ReactNode; }) { return ( - - {children} + + +
+
+
+ Keep Logo + + Keep + +
+ + {children} + +
+
+ ); } diff --git a/keep-ui/app/(signin)/signin/SignInForm.tsx b/keep-ui/app/(signin)/signin/SignInForm.tsx index e5a4ef1f4..faf6d1553 100644 --- a/keep-ui/app/(signin)/signin/SignInForm.tsx +++ b/keep-ui/app/(signin)/signin/SignInForm.tsx @@ -1,13 +1,12 @@ "use client"; import { signIn, getProviders } from "next-auth/react"; -import { Text, TextInput, Button, Card } from "@tremor/react"; -import Image from "next/image"; +import { Text, TextInput, Button } from "@tremor/react"; import { useEffect, useState } from "react"; import { useForm } from "react-hook-form"; -import "../../globals.css"; import { authenticate, revalidateAfterAuth } from "@/app/actions/authactions"; import { useRouter } from "next/navigation"; +import "../../globals.css"; export interface Provider { id: string; @@ -109,129 +108,92 @@ export default function SignInForm({ params }: { params?: { amt: string } }) { // Show loading state during redirect if (isRedirecting) { return ( -
- -
-
- Keep Logo -
- - Authentication successful, redirecting... - -
-
-
+ + Authentication successful, redirecting... + ); } if (providers?.credentials) { return ( -
- -
-
- Keep Logo + <> + + Log in to your account + + +
+ {errors.root && ( +
+ + {errors.root.message} +
- - - Sign in to Keep + )} +
+ + Username + + {errors.username && ( + + {errors.username.message} + + )} +
- -
- - Username - - - {errors.username && ( - - {errors.username.message} - - )} -
- -
- - Password - - - {errors.password && ( - - {errors.password.message} - - )} -
- - - - {errors.root && ( -
- - {errors.root.message} - -
- )} - +
+ + Password + + + {errors.password && ( + + {errors.password.message} + + )}
- -
+ + + + ); } - return <>Redirecting to authentication...; + return ( + + Redirecting to authentication... + + ); } diff --git a/keep-ui/features/incident-list/ui/incident-list-error.tsx b/keep-ui/features/incident-list/ui/incident-list-error.tsx index 2d91c42cc..b8bb6f455 100644 --- a/keep-ui/features/incident-list/ui/incident-list-error.tsx +++ b/keep-ui/features/incident-list/ui/incident-list-error.tsx @@ -19,7 +19,11 @@ export const IncidentListError = ({
Failed to load incidents - Please try again. If the issue persists, contact us + Error: {incidentError.message} + + + {incidentError.proposedResolution || + "Please try again. If the issue persists, contact us"}