From dc59681f657be6a7ce01539bc03e85f781e0658d Mon Sep 17 00:00:00 2001 From: Vladimir Filonov Date: Sat, 7 Dec 2024 23:15:39 +0400 Subject: [PATCH] Add tests for history-based rules --- keep/api/core/db.py | 2 +- keep/api/models/db/rule.py | 2 +- keep/rulesengine/rulesengine.py | 23 +++-- tests/test_rules_engine.py | 148 +++++++++++++++++++++++++++++++- 4 files changed, 158 insertions(+), 17 deletions(-) diff --git a/keep/api/core/db.py b/keep/api/core/db.py index ff2f22417..d29b785c7 100644 --- a/keep/api/core/db.py +++ b/keep/api/core/db.py @@ -4281,7 +4281,7 @@ def is_all_alerts_in_status( session: Optional[Session] = None ): - if incident.alerts_count == 0: + if incident and incident.alerts_count == 0: return False with existed_or_new_session(session) as session: diff --git a/keep/api/models/db/rule.py b/keep/api/models/db/rule.py index edef770e4..4b626df1b 100644 --- a/keep/api/models/db/rule.py +++ b/keep/api/models/db/rule.py @@ -77,4 +77,4 @@ def add_alert(self, condition, fingerprint): flag_modified(self, "state") def get_all_alerts(self): - return list(set(chain(*self.state.values()))) \ No newline at end of file + return list(set(chain(*self.state.values()))) diff --git a/keep/rulesengine/rulesengine.py b/keep/rulesengine/rulesengine.py index b08381547..200097930 100644 --- a/keep/rulesengine/rulesengine.py +++ b/keep/rulesengine/rulesengine.py @@ -86,8 +86,14 @@ def run_rules( rule_fingerprint, session=session, ) - - if not incident: + if incident: + incident = assign_alert_to_incident( + fingerprint=event.fingerprint, + incident=incident, + tenant_id=self.tenant_id, + session=session, + ) + else: self.logger.info( f"No existing incidents for rule {rule.name}. Checking incident creation conditions" @@ -100,22 +106,13 @@ def run_rules( incident = self._create_incident_with_alerts( rule, rule_fingerprint, [event.fingerprint], session=session ) - incidents_dto[incident.id] = IncidentDto.from_db_incident(incident) - elif rule.create_on == "all": incident = self._process_event_for_history_based_rule( event, rule, sub_rule, rule_groups, rule_fingerprint, session ) - if incident: - incidents_dto[incident.id] = IncidentDto.from_db_incident(incident) - else: - incident = assign_alert_to_incident( - fingerprint=event.fingerprint, - incident=incident, - tenant_id=self.tenant_id, - session=session, - ) + if incident: + incident = self._resolve_incident_if_require(rule, incident, session) incidents_dto[incident.id] = IncidentDto.from_db_incident(incident) diff --git a/tests/test_rules_engine.py b/tests/test_rules_engine.py index c7d362620..235b07ce9 100644 --- a/tests/test_rules_engine.py +++ b/tests/test_rules_engine.py @@ -3,9 +3,11 @@ import json import os import uuid +from time import sleep import pytest +from boom import fingerprint from keep.api.core.db import create_rule as create_rule_db from keep.api.core.db import get_incident_alerts_by_incident_id, get_last_incidents, set_last_alert from keep.api.core.db import get_rules as get_rules_db @@ -17,8 +19,8 @@ IncidentSeverity, IncidentStatus, ) -from keep.api.models.db.alert import Alert -from keep.api.models.db.rule import ResolveOn +from keep.api.models.db.alert import Alert, Incident +from keep.api.models.db.rule import ResolveOn, CreateIncidentOn, RuleEventGroup from keep.rulesengine.rulesengine import RulesEngine @@ -582,6 +584,148 @@ def test_incident_resolution_on_edge( assert incident.status == IncidentStatus.RESOLVED.value +def test_rule_event_groups(db_session, create_alert): + + create_rule_db( + tenant_id=SINGLE_TENANT_UUID, + name="test-rule", + definition={ + "sql": "N/A", # we don't use it anymore + "params": {}, + }, + timeframe=600, + timeunit="seconds", + definition_cel='(severity == "critical") || (severity == "high")', + created_by="test@keephq.dev", + create_on=CreateIncidentOn.ALL.value, + ) + + create_alert( + "Critical Alert", + AlertStatus.FIRING, + datetime.datetime.utcnow(), + { + "severity": AlertSeverity.CRITICAL.value, + }, + ) + + # No incident yet + assert db_session.query(Incident).count() == 0 + # But RuleEventGroup + assert db_session.query(RuleEventGroup).count() == 1 + event_group = db_session.query(RuleEventGroup).first() + alert_1 = db_session.query(Alert).order_by(Alert.timestamp.desc()).first() + + assert isinstance(event_group.state, dict) + assert 'severity == "critical"' in event_group.state + assert len(event_group.state['severity == "critical"']) == 1 + assert event_group.state['severity == "critical"'][0] == alert_1.fingerprint + + create_alert( + "Critical Alert 2", + AlertStatus.FIRING, + datetime.datetime.utcnow(), + { + "severity": AlertSeverity.CRITICAL.value, + }, + ) + + db_session.refresh(event_group) + alert_2 = db_session.query(Alert).order_by(Alert.timestamp.desc()).first() + + # Still no incident yet + assert db_session.query(Incident).count() == 0 + # And still one RuleEventGroup + assert db_session.query(RuleEventGroup).count() == 1 + + assert isinstance(event_group.state, dict) + assert 'severity == "critical"' in event_group.state + assert len(event_group.state['severity == "critical"']) == 2 + assert event_group.state['severity == "critical"'][0] == alert_1.fingerprint + assert event_group.state['severity == "critical"'][1] == alert_2.fingerprint + + create_alert( + "High Alert", + AlertStatus.FIRING, + datetime.datetime.utcnow(), + { + "severity": AlertSeverity.HIGH.value, + }, + ) + alert_3 = db_session.query(Alert).order_by(Alert.timestamp.desc()).first() + + # RuleEventGroup was removed + assert db_session.query(RuleEventGroup).count() == 0 + + # And incident was started + assert db_session.query(Incident).count() == 1 + + incident = db_session.query(Incident).first() + assert incident.alerts_count == 3 + + alerts, alert_count = get_incident_alerts_by_incident_id( + tenant_id=SINGLE_TENANT_UUID, + incident_id=str(incident.id), + session=db_session, + ) + assert alert_count == 3 + assert len(alerts) == 3 + + fingerprints = [a.fingerprint for a in alerts] + + assert alert_1.fingerprint in fingerprints + assert alert_2.fingerprint in fingerprints + assert alert_3.fingerprint in fingerprints + + +def test_rule_event_groups_expires(db_session, create_alert): + + create_rule_db( + tenant_id=SINGLE_TENANT_UUID, + name="test-rule", + definition={ + "sql": "N/A", # we don't use it anymore + "params": {}, + }, + timeframe=1, + timeunit="seconds", + definition_cel='(severity == "critical") || (severity == "high")', + created_by="test@keephq.dev", + create_on=CreateIncidentOn.ALL.value, + ) + + create_alert( + "Critical Alert", + AlertStatus.FIRING, + datetime.datetime.utcnow(), + { + "severity": AlertSeverity.CRITICAL.value, + }, + ) + + # No incident yet + assert db_session.query(Incident).count() == 0 + # One RuleEventGroup + assert db_session.query(RuleEventGroup).count() == 1 + + sleep(1) + + create_alert( + "High Alert", + AlertStatus.FIRING, + datetime.datetime.utcnow(), + { + "severity": AlertSeverity.HIGH.value, + }, + ) + + # Still no incident + assert db_session.query(Incident).count() == 0 + # And now two RuleEventGroup - first one was expired + assert db_session.query(RuleEventGroup).count() == 2 + + + # Next steps: # - test that alerts in the same group are being updated correctly # - test group are being updated correctly