From e5001e9c723cfd3dbab4cc0c5cb9ab2943cec6f3 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 13:21:46 +0200 Subject: [PATCH 01/28] Demo mode --- keep/api/config.py | 2 + keep/api/core/demo_mode_runner.py | 186 ++++++++++++++++++++++++++++++ scripts/simulate_alerts.py | 62 ++-------- 3 files changed, 199 insertions(+), 51 deletions(-) create mode 100644 keep/api/core/demo_mode_runner.py diff --git a/keep/api/config.py b/keep/api/config.py index c420860d0..d1e561479 100644 --- a/keep/api/config.py +++ b/keep/api/config.py @@ -5,6 +5,7 @@ from keep.api.api import AUTH_TYPE from keep.api.core.db_on_start import migrate_db, try_create_single_tenant from keep.api.core.report_uptime import launch_uptime_reporting +from keep.api.core.demo_mode_runner import launch_demo_mode from keep.api.core.dependencies import SINGLE_TENANT_UUID from keep.identitymanager.identitymanagerfactory import IdentityManagerTypes @@ -20,6 +21,7 @@ def on_starting(server=None): migrate_db() launch_uptime_reporting() + launch_demo_mode() # Create single tenant if it doesn't exist if AUTH_TYPE in [ diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py new file mode 100644 index 000000000..9fc03a757 --- /dev/null +++ b/keep/api/core/demo_mode_runner.py @@ -0,0 +1,186 @@ +import os +from fastapi import Depends +import requests +import asyncio +import logging +import threading +import random +import time, datetime +from datetime import timezone + +from keep.api.core.db import get_session, get_session_sync +from keep.api.core.dependencies import SINGLE_TENANT_UUID +from keep.api.utils.tenant_utils import get_or_create_api_key +from keep.providers.providers_factory import ProvidersFactory + +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s %(levelname)s %(name)s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +logger = logging.getLogger(__name__) + +LIVE_DEMO_MODE = os.environ.get("LIVE_DEMO_MODE", "false").lower() == "true" + +incidents = [ + {"name": "Performance issue, CPU, DB, UI impacted", "severity": "critical"}, + { + "name": "Message queue bucles up", + "severity": "warning", + }, +] + +correlation_rules_to_create = [ + { + "sqlQuery": { + "sql": "((name like :name_1))", + "params": { + "name_1": "%mq%" + } + }, + "groupDescription": "This rule groups all alerts related to MQ.", + "ruleName": "Message Queue Bucle Up", + "celQuery": "(name.contains(\"mq\"))", + "timeframeInSeconds": 86400, + "timeUnit": "hours", + "groupingCriteria": [], + "requireApprove": False, + "resolveOn": "never" + }, + { + "sqlQuery": { + "sql": "((name like :name_1) or (name = :name_2) or (name like :name_3))", + "params": { + "name_1": "%network_latency_high%", + "name_1": "high_cpu_usage", + "name_3": "%database_connection_failure%" + } + }, + "groupDescription": "This rule groups alerts from multiple sources.", + "ruleName": "Application issue caused by DB load", + "celQuery": "(name.contains(\"network_latency_high\")) || (name == \"high_cpu_usage\") || (name.contains(\"database_connection_failure\"))", + "timeframeInSeconds": 86400, + "timeUnit": "hours", + "groupingCriteria": [], + "requireApprove": False, + "resolveOn": "never" + }, +] + + +def get_or_create_correlation_rules(keep_api_key, keep_api_url): + correlation_rules_existing = requests.get( + f"{keep_api_url}/rules", + headers={"x-api-key": keep_api_key}, + ) + correlation_rules_existing.raise_for_status() + correlation_rules_existing = correlation_rules_existing.json() + + if len(correlation_rules_existing) == 0: + for correlation_rule in correlation_rules_to_create: + response = requests.post( + f"{keep_api_url}/rules", + headers={"x-api-key": keep_api_key}, + json=correlation_rule, + ) + response.raise_for_status() + + +def remove_old_incidents(keep_api_key, keep_api_url): + incidents_existing = requests.get( + f"{keep_api_url}/incidents", + headers={"x-api-key": keep_api_key}, + ) + incidents_existing.raise_for_status() + incidents_existing = incidents_existing.json()['items'] + + for incident in incidents_existing: + if datetime.datetime.strptime( + incident["creation_time"], "%Y-%m-%dT%H:%M:%S.%f" + ).replace(tzinfo=timezone.utc) < (datetime.datetime.now() - datetime.timedelta(minutes=30)).astimezone(timezone.utc): + incident_id = incident["id"] + response = requests.delete( + f"{keep_api_url}/incidents/{incident_id}", + headers={"x-api-key": keep_api_key}, + ) + response.raise_for_status() + + +async def simulate_alerts(keep_api_url=None, keep_api_key=None, sleep_interval=5, demo_correlation_rules=False): + GENERATE_DEDUPLICATIONS = True + + providers = ["prometheus", "grafana"] + + provider_classes = { + provider: ProvidersFactory.get_provider_class(provider) + for provider in providers + } + + # Wait in the beginning because server may not be ready yet. + await asyncio.sleep(sleep_interval * 2) + + get_or_create_correlation_rules(keep_api_key, keep_api_url) + + while True: + await asyncio.sleep(sleep_interval) + + remove_old_incidents(keep_api_key, keep_api_url) + + # choose provider + provider_type = random.choice(providers) + send_alert_url = "{}/alerts/event/{}".format( + keep_api_url, provider_type) + provider = provider_classes[provider_type] + alert = provider.simulate_alert() + + # Determine number of times to send the same alert + num_iterations = 1 + if GENERATE_DEDUPLICATIONS: + num_iterations = random.randint(1, 3) + + for _ in range(num_iterations): + logger.info("Sending alert: {}".format(alert)) + try: + env = random.choice(["production", "staging", "development"]) + response = requests.post( + send_alert_url + f"?provider_id={provider_type}-{env}", + headers={"x-api-key": keep_api_key}, + json=alert, + ) + response.raise_for_status() # Raise an HTTPError for bad responses + except requests.exceptions.RequestException as e: + logger.error("Failed to send alert: {}".format(e)) + time.sleep(SLEEP_INTERVAL) + continue + + if response.status_code != 202: + logger.error("Failed to send alert: {}".format(response.text)) + else: + logger.info("Alert sent successfully") + +def launch_demo_mode(): + """ + Running async demo in the backgound. + """ + keep_api_url = "http://localhost:" + str(os.environ.get("PORT", 8080)) + keep_api_key = get_or_create_api_key( + session=get_session_sync(), + tenant_id=SINGLE_TENANT_UUID, + created_by="system", + unique_api_key_id="simulate_alerts", + system_description="Simulate Alerts API key", + ) + sleep_interval = 5 + + if LIVE_DEMO_MODE: + thread = threading.Thread(target=asyncio.run, args=(simulate_alerts( + keep_api_url, + keep_api_key, + sleep_interval, + demo_correlation_rules=True + ), )) + thread.start() + logger.info("Simulate Alert launched.") + else: + logger.info("Alert simulation is disabled.") diff --git a/scripts/simulate_alerts.py b/scripts/simulate_alerts.py index fa1d0cc35..abf7dafa1 100644 --- a/scripts/simulate_alerts.py +++ b/scripts/simulate_alerts.py @@ -1,13 +1,9 @@ -import logging import os -import random -import time - -import requests +import logging +import asyncio -from keep.providers.providers_factory import ProvidersFactory +from keep.api.core.demo_mode_runner import simulate_alerts -# configure logging logging.basicConfig( level=logging.DEBUG, format="%(asctime)s %(levelname)s %(name)s %(message)s", @@ -17,53 +13,17 @@ logger = logging.getLogger(__name__) -def main(): - GENERATE_DEDUPLICATIONS = True +async def main(): SLEEP_INTERVAL = float(os.environ.get("SLEEP_INTERVAL", 0.2)) # Configurable sleep interval from env variable keep_api_key = os.environ.get("KEEP_API_KEY") keep_api_url = os.environ.get("KEEP_API_URL") or "http://localhost:8080" - if keep_api_key is None or keep_api_url is None: - raise Exception("KEEP_API_KEY and KEEP_API_URL must be set") - - providers = ["prometheus", "grafana"] - provider_classes = { - provider: ProvidersFactory.get_provider_class(provider) - for provider in providers - } - while True: - # choose provider - provider_type = random.choice(providers) - send_alert_url = "{}/alerts/event/{}".format(keep_api_url, provider_type) - provider = provider_classes[provider_type] - alert = provider.simulate_alert() - - # Determine number of times to send the same alert - num_iterations = 1 - if GENERATE_DEDUPLICATIONS: - num_iterations = random.randint(1, 3) - - for _ in range(num_iterations): - logger.info("Sending alert: {}".format(alert)) - try: - env = random.choice(["production", "staging", "development"]) - response = requests.post( - send_alert_url + f"?provider_id={provider_type}-{env}", - headers={"x-api-key": keep_api_key}, - json=alert, - ) - response.raise_for_status() # Raise an HTTPError for bad responses - except requests.exceptions.RequestException as e: - logger.error("Failed to send alert: {}".format(e)) - time.sleep(SLEEP_INTERVAL) - continue - - if response.status_code != 202: - logger.error("Failed to send alert: {}".format(response.text)) - else: - logger.info("Alert sent successfully") - - time.sleep(SLEEP_INTERVAL) # Wait for the configured interval before sending the next alert + await simulate_alerts( + keep_api_key=keep_api_key, + keep_api_url=keep_api_url, + sleep_interval=SLEEP_INTERVAL, + demo_correlation_rules=False, + ) if __name__ == "__main__": - main() + asyncio.run(main()) From 31a3e885cb74bee78a45cba945269f5e6fe4796a Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 13:24:21 +0200 Subject: [PATCH 02/28] Docs --- docs/deployment/configuration.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/deployment/configuration.mdx b/docs/deployment/configuration.mdx index eaa04d439..e66f9eb0b 100644 --- a/docs/deployment/configuration.mdx +++ b/docs/deployment/configuration.mdx @@ -25,6 +25,7 @@ General configuration variables control the core behavior of the Keep server. Th | **KEEP_API_URL** | Specifies the Keep API URL | No | Constructed from HOST and PORT | Valid URL | | **KEEP_STORE_RAW_ALERTS** | Enables storing of raw alerts | No | "false" | "true" or "false" | | **TENANT_CONFIGURATION_RELOAD_TIME** | Time in minutes to reload tenant configurations | No | 5 | Positive integer | +| **LIVE_DEMO_MODE** | Keep will simulate incoming alerts and other activity | No | "false" | "true" or "false" | ### Logging and Environment From c6824913e7045b3bd537c01d7f3bbd583a99c7d4 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 13:28:05 +0200 Subject: [PATCH 03/28] Fix --- keep/api/core/demo_mode_runner.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index 9fc03a757..15bd7da2f 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -1,5 +1,4 @@ import os -from fastapi import Depends import requests import asyncio import logging @@ -8,7 +7,7 @@ import time, datetime from datetime import timezone -from keep.api.core.db import get_session, get_session_sync +from keep.api.core.db import get_session_sync from keep.api.core.dependencies import SINGLE_TENANT_UUID from keep.api.utils.tenant_utils import get_or_create_api_key from keep.providers.providers_factory import ProvidersFactory @@ -53,7 +52,7 @@ "sql": "((name like :name_1) or (name = :name_2) or (name like :name_3))", "params": { "name_1": "%network_latency_high%", - "name_1": "high_cpu_usage", + "name_2": "high_cpu_usage", "name_3": "%database_connection_failure%" } }, @@ -151,7 +150,7 @@ async def simulate_alerts(keep_api_url=None, keep_api_key=None, sleep_interval=5 response.raise_for_status() # Raise an HTTPError for bad responses except requests.exceptions.RequestException as e: logger.error("Failed to send alert: {}".format(e)) - time.sleep(SLEEP_INTERVAL) + time.sleep(sleep_interval) continue if response.status_code != 202: From c63e46902d9fd0fb07c7edcf5fdf83ee71c3462c Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 13:36:13 +0200 Subject: [PATCH 04/28] up --- keep/api/core/demo_mode_runner.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index 15bd7da2f..b46e4910d 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -22,14 +22,6 @@ LIVE_DEMO_MODE = os.environ.get("LIVE_DEMO_MODE", "false").lower() == "true" -incidents = [ - {"name": "Performance issue, CPU, DB, UI impacted", "severity": "critical"}, - { - "name": "Message queue bucles up", - "severity": "warning", - }, -] - correlation_rules_to_create = [ { "sqlQuery": { @@ -39,7 +31,7 @@ } }, "groupDescription": "This rule groups all alerts related to MQ.", - "ruleName": "Message Queue Bucle Up", + "ruleName": "Message Queue Buckle Up", "celQuery": "(name.contains(\"mq\"))", "timeframeInSeconds": 86400, "timeUnit": "hours", @@ -163,20 +155,19 @@ def launch_demo_mode(): Running async demo in the backgound. """ keep_api_url = "http://localhost:" + str(os.environ.get("PORT", 8080)) - keep_api_key = get_or_create_api_key( + keep_api_key = os.environ.get("KEEP_API_KEY", get_or_create_api_key( session=get_session_sync(), tenant_id=SINGLE_TENANT_UUID, created_by="system", unique_api_key_id="simulate_alerts", system_description="Simulate Alerts API key", - ) - sleep_interval = 5 + )) if LIVE_DEMO_MODE: thread = threading.Thread(target=asyncio.run, args=(simulate_alerts( keep_api_url, keep_api_key, - sleep_interval, + sleep_interval=5, demo_correlation_rules=True ), )) thread.start() From 7daf2ce793fc10478c728c3ba886edbb1e0d4dc2 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 13:38:37 +0200 Subject: [PATCH 05/28] minor --- keep/api/core/demo_mode_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index b46e4910d..d52144294 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -79,6 +79,7 @@ def get_or_create_correlation_rules(keep_api_key, keep_api_url): def remove_old_incidents(keep_api_key, keep_api_url): + consider_old_timedelta = datetime.timedelta(minutes=30) incidents_existing = requests.get( f"{keep_api_url}/incidents", headers={"x-api-key": keep_api_key}, @@ -89,7 +90,7 @@ def remove_old_incidents(keep_api_key, keep_api_url): for incident in incidents_existing: if datetime.datetime.strptime( incident["creation_time"], "%Y-%m-%dT%H:%M:%S.%f" - ).replace(tzinfo=timezone.utc) < (datetime.datetime.now() - datetime.timedelta(minutes=30)).astimezone(timezone.utc): + ).replace(tzinfo=timezone.utc) < (datetime.datetime.now() - consider_old_timedelta).astimezone(timezone.utc): incident_id = incident["id"] response = requests.delete( f"{keep_api_url}/incidents/{incident_id}", From 9b8c52b76879814ae596104cefc04096d334dcd7 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 13:39:39 +0200 Subject: [PATCH 06/28] fix --- keep/api/core/demo_mode_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index d52144294..374397159 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -4,7 +4,8 @@ import logging import threading import random -import time, datetime +import time +import datetime from datetime import timezone from keep.api.core.db import get_session_sync From 5bc2ed94499c88f2f9e69c2923200242f197dc60 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 14:57:28 +0200 Subject: [PATCH 07/28] Read or Create topology --- keep/api/core/demo_mode_runner.py | 224 +++++++++++++++++- .../providers/grafana_provider/alerts_mock.py | 3 + .../prometheus_provider/alerts_mock.py | 9 +- 3 files changed, 229 insertions(+), 7 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index 374397159..3f4b4f74e 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -10,6 +10,8 @@ from keep.api.core.db import get_session_sync from keep.api.core.dependencies import SINGLE_TENANT_UUID +from keep.api.models.db.topology import TopologyServiceInDto +from keep.api.tasks.process_topology_task import process_topology from keep.api.utils.tenant_utils import get_or_create_api_key from keep.providers.providers_factory import ProvidersFactory @@ -60,6 +62,211 @@ }, ] +services_to_create = [ + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="api", + display_name="API Service", + environment="prod", + description="The main API service", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="10.0.0.1", + mac_address="", + category="Python", + manufacturer="", + dependencies={ + "db": "SQL", + "queue": "AMQP", + }, + application_ids=[], + updated_at="2024-11-18T09:23:46" + ), + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="ui", + display_name="Platform", + environment="prod", + description="The user interface (aka Platform)", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="10.0.0.2", + mac_address="", + category="nextjs", + manufacturer="", + dependencies={ + "api": "HTTP/S", + }, + application_ids=[], + updated_at="2024-11-18T09:29:25" + ), + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="db", + display_name="DB", + environment="prod", + description="Production Database", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="10.0.0.3", + mac_address="", + category="postgres", + manufacturer="", + dependencies={}, + application_ids=[], + updated_at="2024-11-18T09:30:44" + ), + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="queue", + display_name="Kafka", + environment="prod", + description="Production Queue", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="10.0.0.4", + mac_address="", + category="Kafka", + manufacturer="", + dependencies={ + "processor": "AMQP", + }, + application_ids=[], + updated_at="2024-11-18T09:31:31" + ), + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="processor", + display_name="Processor", + environment="prod", + description="Processing Service", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="10.0.0.5", + mac_address="", + category="go", + manufacturer="", + dependencies={ + "storage": "HTTP/S", + }, + application_ids=[], + updated_at="2024-11-18T10:02:20" + ), + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="backoffice", + display_name="Backoffice", + environment="prod", + description="Backoffice UI to control configuration", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="172.1.1.0", + mac_address="", + category="nextjs", + manufacturer="", + dependencies={ + "api": "HTTP/S", + }, + application_ids=[], + updated_at="2024-11-18T10:11:31" + ), + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="storage", + display_name="Storage", + environment="prod", + description="Storage Service", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="10.0.0.8", + mac_address="", + category="python", + manufacturer="", + dependencies={}, + application_ids=[], + updated_at="2024-11-18T10:13:56" + ) +] + +application_to_create = { + "name": "Main App", + "description": "It is the most critical business process ever imaginable.", + "services": [ + {"name": "API Service", "service": "api"}, + {"name": "DB", "service": "db"}, + {"name": "Kafka", "service": "queue"}, + {"name": "Processor", "service": "processor"}, + {"name": "Storage", "service": "storage"} + ] +} + +def get_or_create_topology(keep_api_key, keep_api_url): + services_existing = requests.get( + f"{keep_api_url}/topology", + headers={"x-api-key": keep_api_key}, + ) + services_existing.raise_for_status() + services_existing = services_existing.json() + + if len(services_existing) == 0 or True: + process_topology( + SINGLE_TENANT_UUID, + services_to_create, + "Prod-Datadog", + "datadog" + ) + + # Create application + applications_existing = requests.get( + f"{keep_api_url}/topology/applications", + headers={"x-api-key": keep_api_key}, + ) + applications_existing.raise_for_status() + applications_existing = applications_existing.json() + + if len(applications_existing) == 0: + # Pull services again to get their ids + services_existing = requests.get( + f"{keep_api_url}/topology", + headers={"x-api-key": keep_api_key}, + ) + services_existing.raise_for_status() + services_existing = services_existing.json() + + # Update application_to_create with existing services ids + for service in application_to_create["services"]: + for existing_service in services_existing: + if service["name"] == existing_service["display_name"]: + service["id"] = existing_service["id"] + + response = requests.post( + f"{keep_api_url}/topology/applications", + headers={"x-api-key": keep_api_key}, + json=application_to_create, + ) + response.raise_for_status() + def get_or_create_correlation_rules(keep_api_key, keep_api_url): correlation_rules_existing = requests.get( @@ -100,7 +307,13 @@ def remove_old_incidents(keep_api_key, keep_api_url): response.raise_for_status() -async def simulate_alerts(keep_api_url=None, keep_api_key=None, sleep_interval=5, demo_correlation_rules=False): +async def simulate_alerts( + keep_api_url=None, + keep_api_key=None, + sleep_interval=5, + demo_correlation_rules=False, + demo_topology=False + ): GENERATE_DEDUPLICATIONS = True providers = ["prometheus", "grafana"] @@ -113,7 +326,11 @@ async def simulate_alerts(keep_api_url=None, keep_api_key=None, sleep_interval=5 # Wait in the beginning because server may not be ready yet. await asyncio.sleep(sleep_interval * 2) - get_or_create_correlation_rules(keep_api_key, keep_api_url) + if demo_correlation_rules: + get_or_create_correlation_rules(keep_api_key, keep_api_url) + + if demo_topology: + get_or_create_topology(keep_api_key, keep_api_url) while True: await asyncio.sleep(sleep_interval) @@ -170,7 +387,8 @@ def launch_demo_mode(): keep_api_url, keep_api_key, sleep_interval=5, - demo_correlation_rules=True + demo_correlation_rules=True, + demo_topology=True ), )) thread.start() logger.info("Simulate Alert launched.") diff --git a/keep/providers/grafana_provider/alerts_mock.py b/keep/providers/grafana_provider/alerts_mock.py index bcb940077..4d9d0357a 100644 --- a/keep/providers/grafana_provider/alerts_mock.py +++ b/keep/providers/grafana_provider/alerts_mock.py @@ -1,6 +1,7 @@ ALERTS = { "database_connection_failure": { "severity": "critical", + "service": "api", "title": "Database Connection Failure", "alerts": [ { @@ -48,6 +49,7 @@ ], }, "high_memory_usage": { + "service": "api", "payload": { "condition": "B", "data": [ @@ -92,6 +94,7 @@ }, }, "network_latency_high": { + "service": "db", "payload": { "condition": "C", "data": [ diff --git a/keep/providers/prometheus_provider/alerts_mock.py b/keep/providers/prometheus_provider/alerts_mock.py index 1287f1a68..3c03bd675 100644 --- a/keep/providers/prometheus_provider/alerts_mock.py +++ b/keep/providers/prometheus_provider/alerts_mock.py @@ -11,7 +11,7 @@ }, "parameters": { "labels.host": ["host1", "host2", "host3"], - "labels.service": ["calendar-producer-java-otel-api-dd", "kafka"], + "labels.service": ["calendar-producer-java-otel-api-dd", "kafka", "api", "queue", "db"], "labels.instance": ["instance1", "instance2", "instance3"], }, }, @@ -20,11 +20,12 @@ "summary": "Message queue is over 33% capacity", "labels": { "severity": "warning", + "customer_id": "acme" }, }, "parameters": { "labels.queue": ["queue1", "queue2", "queue3"], - "labels.service": ["calendar-producer-java-otel-api-dd", "kafka"], + "labels.service": ["calendar-producer-java-otel-api-dd", "kafka", "queue"], "labels.mq_manager": ["mq_manager1", "mq_manager2", "mq_manager3"], }, }, @@ -37,7 +38,7 @@ }, "parameters": { "labels.host": ["host1", "host2", "host3"], - "labels.service": ["calendar-producer-java-otel-api-dd", "kafka"], + "labels.service": ["calendar-producer-java-otel-api-dd", "kafka", "api", "queue", "db"], "labels.instance": ["instance1", "instance2", "instance3"], }, }, @@ -50,7 +51,7 @@ }, "parameters": { "labels.host": ["host1", "host2", "host3"], - "labels.service": ["calendar-producer-java-otel-api-dd", "kafka"], + "labels.service": ["calendar-producer-java-otel-api-dd", "kafka", "api", "queue", "db"], "labels.instance": ["instance1", "instance2", "instance3"], }, }, From 294a7924d5819b8d2064069890e4955f318cf17f Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 16:52:28 +0200 Subject: [PATCH 08/28] Fingerprint randomization --- keep/api/core/demo_mode_runner.py | 29 ++++++++++++++++--- .../cloudwatch_provider/alerts_mock.py | 2 +- .../providers/datadog_provider/alerts_mock.py | 8 ++--- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index 3f4b4f74e..084091004 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -7,6 +7,7 @@ import time import datetime from datetime import timezone +from requests.models import PreparedRequest from keep.api.core.db import get_session_sync from keep.api.core.dependencies import SINGLE_TENANT_UUID @@ -316,7 +317,17 @@ async def simulate_alerts( ): GENERATE_DEDUPLICATIONS = True - providers = ["prometheus", "grafana"] + providers = [ + "prometheus", + "grafana", + "cloudwatch", + "datadog", + ] + + providers_to_randomize_fingerprint_for = [ + "cloudwatch", + "datadog", + ] provider_classes = { provider: ProvidersFactory.get_provider_class(provider) @@ -333,8 +344,6 @@ async def simulate_alerts( get_or_create_topology(keep_api_key, keep_api_url) while True: - await asyncio.sleep(sleep_interval) - remove_old_incidents(keep_api_key, keep_api_url) # choose provider @@ -344,6 +353,12 @@ async def simulate_alerts( provider = provider_classes[provider_type] alert = provider.simulate_alert() + send_alert_url_params = {} + + if provider_type in providers_to_randomize_fingerprint_for: + send_alert_url_params['fingerprint'] = \ + ''.join(random.choices('abcdefghijklmnopqrstuvwxyz0123456789', k=10)) + # Determine number of times to send the same alert num_iterations = 1 if GENERATE_DEDUPLICATIONS: @@ -353,8 +368,11 @@ async def simulate_alerts( logger.info("Sending alert: {}".format(alert)) try: env = random.choice(["production", "staging", "development"]) + send_alert_url_params['provider_id'] = f"{provider_type}-{env}" + prepared_request = PreparedRequest() + prepared_request.prepare_url(send_alert_url, send_alert_url_params) response = requests.post( - send_alert_url + f"?provider_id={provider_type}-{env}", + prepared_request.url, headers={"x-api-key": keep_api_key}, json=alert, ) @@ -369,6 +387,9 @@ async def simulate_alerts( else: logger.info("Alert sent successfully") + await asyncio.sleep(sleep_interval) + + def launch_demo_mode(): """ Running async demo in the backgound. diff --git a/keep/providers/cloudwatch_provider/alerts_mock.py b/keep/providers/cloudwatch_provider/alerts_mock.py index 5fe2ec41e..1fc2377fe 100644 --- a/keep/providers/cloudwatch_provider/alerts_mock.py +++ b/keep/providers/cloudwatch_provider/alerts_mock.py @@ -11,7 +11,7 @@ } }, "parameters": { - "Message.AlarmName": ["HighCPUUsage-1", "HighCPUUsage-2", "HighCPUUsage-3"], + "Message.AlarmName": ["HighCPUUsage", "HighCPUUsageOnAPod", "PodRecycled"], }, }, } diff --git a/keep/providers/datadog_provider/alerts_mock.py b/keep/providers/datadog_provider/alerts_mock.py index 21205eab0..0ff032a3b 100644 --- a/keep/providers/datadog_provider/alerts_mock.py +++ b/keep/providers/datadog_provider/alerts_mock.py @@ -11,8 +11,8 @@ }, "parameters": { "tags": [ - "environment:production,team:backend,monitor", - "environment:staging,team:backend,monitor", + "environment:production,team:backend,monitor,service:api", + "environment:staging,team:backend,monitor,service:api", ], "priority": ["P2", "P3", "P4"], }, @@ -29,8 +29,8 @@ }, "parameters": { "tags": [ - "environment:production,team:analytics,monitor", - "environment:staging,team:database,monitor", + "environment:production,team:analytics,monitor,service:api", + "environment:staging,team:database,monitor,service:api", ], "priority": ["P1", "P3", "P4"], }, From 42cbb09620d10fb0737429a5d04de83eacd9923b Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 17:06:46 +0200 Subject: [PATCH 09/28] conditional import --- keep/api/config.py | 8 ++++++-- keep/api/core/demo_mode_runner.py | 25 ++++++++++--------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/keep/api/config.py b/keep/api/config.py index d1e561479..58b02cf13 100644 --- a/keep/api/config.py +++ b/keep/api/config.py @@ -5,7 +5,6 @@ from keep.api.api import AUTH_TYPE from keep.api.core.db_on_start import migrate_db, try_create_single_tenant from keep.api.core.report_uptime import launch_uptime_reporting -from keep.api.core.demo_mode_runner import launch_demo_mode from keep.api.core.dependencies import SINGLE_TENANT_UUID from keep.identitymanager.identitymanagerfactory import IdentityManagerTypes @@ -14,6 +13,8 @@ keep.api.logging.setup_logging() logger = logging.getLogger(__name__) +LIVE_DEMO_MODE = os.environ.get("LIVE_DEMO_MODE", "false").lower() == "true" + def on_starting(server=None): """This function is called by the gunicorn server when it starts""" @@ -21,7 +22,10 @@ def on_starting(server=None): migrate_db() launch_uptime_reporting() - launch_demo_mode() + + if LIVE_DEMO_MODE: + from keep.api.core.demo_mode_runner import launch_demo_mode + launch_demo_mode() # Create single tenant if it doesn't exist if AUTH_TYPE in [ diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index 084091004..cf7d609a0 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -24,8 +24,6 @@ logger = logging.getLogger(__name__) -LIVE_DEMO_MODE = os.environ.get("LIVE_DEMO_MODE", "false").lower() == "true" - correlation_rules_to_create = [ { "sqlQuery": { @@ -230,7 +228,7 @@ def get_or_create_topology(keep_api_key, keep_api_url): services_existing.raise_for_status() services_existing = services_existing.json() - if len(services_existing) == 0 or True: + if len(services_existing) == 0: process_topology( SINGLE_TENANT_UUID, services_to_create, @@ -403,15 +401,12 @@ def launch_demo_mode(): system_description="Simulate Alerts API key", )) - if LIVE_DEMO_MODE: - thread = threading.Thread(target=asyncio.run, args=(simulate_alerts( - keep_api_url, - keep_api_key, - sleep_interval=5, - demo_correlation_rules=True, - demo_topology=True - ), )) - thread.start() - logger.info("Simulate Alert launched.") - else: - logger.info("Alert simulation is disabled.") + thread = threading.Thread(target=asyncio.run, args=(simulate_alerts( + keep_api_url, + keep_api_key, + sleep_interval=5, + demo_correlation_rules=True, + demo_topology=True + ), )) + thread.start() + logger.info("Simulate Alert launched.") From b34c1df3c399d3d19cbe4d2b10b9b0e1ff2b6494 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 18:11:49 +0200 Subject: [PATCH 10/28] Update keep/api/config.py Co-authored-by: Tal Signed-off-by: Matvey Kukuy --- keep/api/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keep/api/config.py b/keep/api/config.py index 58b02cf13..bf5634ce6 100644 --- a/keep/api/config.py +++ b/keep/api/config.py @@ -13,7 +13,7 @@ keep.api.logging.setup_logging() logger = logging.getLogger(__name__) -LIVE_DEMO_MODE = os.environ.get("LIVE_DEMO_MODE", "false").lower() == "true" +LIVE_DEMO_MODE = os.environ.get("KEEP_LIVE_DEMO_MODE", "false").lower() == "true" def on_starting(server=None): From ac3b392d49bc6e21d6a6df71a59981abbb0c3fa2 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 18:11:55 +0200 Subject: [PATCH 11/28] Update docs/deployment/configuration.mdx Co-authored-by: Tal Signed-off-by: Matvey Kukuy --- docs/deployment/configuration.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployment/configuration.mdx b/docs/deployment/configuration.mdx index e66f9eb0b..ddf404912 100644 --- a/docs/deployment/configuration.mdx +++ b/docs/deployment/configuration.mdx @@ -25,7 +25,7 @@ General configuration variables control the core behavior of the Keep server. Th | **KEEP_API_URL** | Specifies the Keep API URL | No | Constructed from HOST and PORT | Valid URL | | **KEEP_STORE_RAW_ALERTS** | Enables storing of raw alerts | No | "false" | "true" or "false" | | **TENANT_CONFIGURATION_RELOAD_TIME** | Time in minutes to reload tenant configurations | No | 5 | Positive integer | -| **LIVE_DEMO_MODE** | Keep will simulate incoming alerts and other activity | No | "false" | "true" or "false" | +| **KEEP_LIVE_DEMO_MODE** | Keep will simulate incoming alerts and other activity | No | "false" | "true" or "false" | ### Logging and Environment From b8edba1858a02b2dd708b944cae9d97fa27766e0 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 19:02:11 +0200 Subject: [PATCH 12/28] Fix --- keep/api/config.py | 9 +++++---- keep/api/core/demo_mode_runner.py | 23 +++++++++++++---------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/keep/api/config.py b/keep/api/config.py index 58b02cf13..1a143c519 100644 --- a/keep/api/config.py +++ b/keep/api/config.py @@ -23,10 +23,6 @@ def on_starting(server=None): migrate_db() launch_uptime_reporting() - if LIVE_DEMO_MODE: - from keep.api.core.demo_mode_runner import launch_demo_mode - launch_demo_mode() - # Create single tenant if it doesn't exist if AUTH_TYPE in [ IdentityManagerTypes.DB.value, @@ -60,4 +56,9 @@ def on_starting(server=None): public_url = ngrok_connection.public_url logger.info(f"ngrok tunnel: {public_url}") os.environ["KEEP_API_URL"] = public_url + + if LIVE_DEMO_MODE: + from keep.api.core.demo_mode_runner import launch_demo_mode + launch_demo_mode() + logger.info("Keep server started") diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index cf7d609a0..09248f23f 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -7,6 +7,7 @@ import time import datetime from datetime import timezone +from dateutil import parser from requests.models import PreparedRequest from keep.api.core.db import get_session_sync @@ -295,9 +296,8 @@ def remove_old_incidents(keep_api_key, keep_api_url): incidents_existing = incidents_existing.json()['items'] for incident in incidents_existing: - if datetime.datetime.strptime( - incident["creation_time"], "%Y-%m-%dT%H:%M:%S.%f" - ).replace(tzinfo=timezone.utc) < (datetime.datetime.now() - consider_old_timedelta).astimezone(timezone.utc): + if parser.parse(incident["creation_time"]).replace(tzinfo=timezone.utc) < \ + (datetime.datetime.now() - consider_old_timedelta).astimezone(timezone.utc): incident_id = incident["id"] response = requests.delete( f"{keep_api_url}/incidents/{incident_id}", @@ -393,13 +393,16 @@ def launch_demo_mode(): Running async demo in the backgound. """ keep_api_url = "http://localhost:" + str(os.environ.get("PORT", 8080)) - keep_api_key = os.environ.get("KEEP_API_KEY", get_or_create_api_key( - session=get_session_sync(), - tenant_id=SINGLE_TENANT_UUID, - created_by="system", - unique_api_key_id="simulate_alerts", - system_description="Simulate Alerts API key", - )) + keep_api_key = os.environ.get("KEEP_API_KEY") + if keep_api_key is None: + with get_session_sync() as session: + keep_api_key = get_or_create_api_key( + session=session, + tenant_id=SINGLE_TENANT_UUID, + created_by="system", + unique_api_key_id="simulate_alerts", + system_description="Simulate Alerts API key", + ) thread = threading.Thread(target=asyncio.run, args=(simulate_alerts( keep_api_url, From ce121da15c7482666b784e608ef8637b92502c80 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 20:22:47 +0200 Subject: [PATCH 13/28] Up --- keep/api/core/demo_mode_runner.py | 55 ++++++++++++++++--------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index 09248f23f..00ee0c7a5 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -229,6 +229,8 @@ def get_or_create_topology(keep_api_key, keep_api_url): services_existing.raise_for_status() services_existing = services_existing.json() + # Creating services + if len(services_existing) == 0: process_topology( SINGLE_TENANT_UUID, @@ -237,35 +239,35 @@ def get_or_create_topology(keep_api_key, keep_api_url): "datadog" ) - # Create application - applications_existing = requests.get( - f"{keep_api_url}/topology/applications", - headers={"x-api-key": keep_api_key}, - ) - applications_existing.raise_for_status() - applications_existing = applications_existing.json() - - if len(applications_existing) == 0: - # Pull services again to get their ids - services_existing = requests.get( - f"{keep_api_url}/topology", - headers={"x-api-key": keep_api_key}, - ) - services_existing.raise_for_status() - services_existing = services_existing.json() - - # Update application_to_create with existing services ids - for service in application_to_create["services"]: - for existing_service in services_existing: - if service["name"] == existing_service["display_name"]: - service["id"] = existing_service["id"] - - response = requests.post( + # Create application + applications_existing = requests.get( f"{keep_api_url}/topology/applications", headers={"x-api-key": keep_api_key}, - json=application_to_create, ) - response.raise_for_status() + applications_existing.raise_for_status() + applications_existing = applications_existing.json() + + if len(applications_existing) == 0: + # Pull services again to get their ids + services_existing = requests.get( + f"{keep_api_url}/topology", + headers={"x-api-key": keep_api_key}, + ) + services_existing.raise_for_status() + services_existing = services_existing.json() + + # Update application_to_create with existing services ids + for service in application_to_create["services"]: + for existing_service in services_existing: + if service["name"] == existing_service["display_name"]: + service["id"] = existing_service["id"] + + response = requests.post( + f"{keep_api_url}/topology/applications", + headers={"x-api-key": keep_api_key}, + json=application_to_create, + ) + response.raise_for_status() def get_or_create_correlation_rules(keep_api_key, keep_api_url): @@ -342,6 +344,7 @@ async def simulate_alerts( get_or_create_topology(keep_api_key, keep_api_url) while True: + logger.info("Looping to send alerts...") remove_old_incidents(keep_api_key, keep_api_url) # choose provider From 89faba0d846aeb85b6692072704d0001b689c930 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 20:40:09 +0200 Subject: [PATCH 14/28] More logs --- keep/api/core/demo_mode_runner.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index 00ee0c7a5..e559c2f81 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -312,8 +312,6 @@ async def simulate_alerts( keep_api_url=None, keep_api_key=None, sleep_interval=5, - demo_correlation_rules=False, - demo_topology=False ): GENERATE_DEDUPLICATIONS = True @@ -337,15 +335,11 @@ async def simulate_alerts( # Wait in the beginning because server may not be ready yet. await asyncio.sleep(sleep_interval * 2) - if demo_correlation_rules: - get_or_create_correlation_rules(keep_api_key, keep_api_url) - - if demo_topology: - get_or_create_topology(keep_api_key, keep_api_url) - while True: logger.info("Looping to send alerts...") + remove_old_incidents(keep_api_key, keep_api_url) + logger.info("Old incidents removed.") # choose provider provider_type = random.choice(providers) @@ -395,6 +389,8 @@ def launch_demo_mode(): """ Running async demo in the backgound. """ + logger.info("Demo mode launched.") + keep_api_url = "http://localhost:" + str(os.environ.get("PORT", 8080)) keep_api_key = os.environ.get("KEEP_API_KEY") if keep_api_key is None: @@ -407,6 +403,12 @@ def launch_demo_mode(): system_description="Simulate Alerts API key", ) + logger.info(f"Creating correlation rules for the demo mode.") + get_or_create_correlation_rules(keep_api_key, keep_api_url) + + logger.info(f"Creating topology for the demo mode.") + get_or_create_topology(keep_api_key, keep_api_url) + thread = threading.Thread(target=asyncio.run, args=(simulate_alerts( keep_api_url, keep_api_key, @@ -415,4 +417,4 @@ def launch_demo_mode(): demo_topology=True ), )) thread.start() - logger.info("Simulate Alert launched.") + logger.info("Demo mode initialized.") From 3fe40716dcc7a1036cc3e0b3cbc0b17aca5e166a Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Mon, 18 Nov 2024 21:12:49 +0200 Subject: [PATCH 15/28] Waiting for API to get live --- keep/api/core/demo_mode_runner.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index e559c2f81..76fdfd960 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -312,6 +312,8 @@ async def simulate_alerts( keep_api_url=None, keep_api_key=None, sleep_interval=5, + demo_correlation_rules=False, + demo_topology=False, ): GENERATE_DEDUPLICATIONS = True @@ -332,12 +334,29 @@ async def simulate_alerts( for provider in providers } - # Wait in the beginning because server may not be ready yet. - await asyncio.sleep(sleep_interval * 2) + while True: + try: + logger.info(f"Demo thread: Checking if server is up at {keep_api_url}...") + response = requests.get(keep_api_url) + response.raise_for_status() + break + except requests.exceptions.RequestException as e: + logger.info(f"Demo thread: API is not up yet. Waiting...") + await asyncio.sleep(5) + + if demo_correlation_rules: + get_or_create_correlation_rules(keep_api_key, keep_api_url) + logger.info("Correlation rules created.") + if demo_topology: + get_or_create_topology(keep_api_key, keep_api_url) + logger.info("Topology created.") + + logger.info(f"Waiting for server to start...") + while True: logger.info("Looping to send alerts...") - + remove_old_incidents(keep_api_key, keep_api_url) logger.info("Old incidents removed.") @@ -403,12 +422,6 @@ def launch_demo_mode(): system_description="Simulate Alerts API key", ) - logger.info(f"Creating correlation rules for the demo mode.") - get_or_create_correlation_rules(keep_api_key, keep_api_url) - - logger.info(f"Creating topology for the demo mode.") - get_or_create_topology(keep_api_key, keep_api_url) - thread = threading.Thread(target=asyncio.run, args=(simulate_alerts( keep_api_url, keep_api_key, From 0e329b208f6703804818630c88610d5db3afa532 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Tue, 19 Nov 2024 10:58:17 +0200 Subject: [PATCH 16/28] when_ready --- keep/api/config.py | 12 +++++++++--- keep/api/core/demo_mode_runner.py | 10 ---------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/keep/api/config.py b/keep/api/config.py index 8f34bda11..791d2322b 100644 --- a/keep/api/config.py +++ b/keep/api/config.py @@ -21,7 +21,6 @@ def on_starting(server=None): logger.info("Keep server starting") migrate_db() - launch_uptime_reporting() # Create single tenant if it doesn't exist if AUTH_TYPE in [ @@ -57,8 +56,15 @@ def on_starting(server=None): logger.info(f"ngrok tunnel: {public_url}") os.environ["KEEP_API_URL"] = public_url + logger.info("Keep server started") + + +def when_ready(server): + """This function is called by the gunicorn server when it is ready to accept requests""" + logger.info("Keep server ready") + + launch_uptime_reporting() + if LIVE_DEMO_MODE: from keep.api.core.demo_mode_runner import launch_demo_mode launch_demo_mode() - - logger.info("Keep server started") diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index 76fdfd960..03f7f149a 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -334,16 +334,6 @@ async def simulate_alerts( for provider in providers } - while True: - try: - logger.info(f"Demo thread: Checking if server is up at {keep_api_url}...") - response = requests.get(keep_api_url) - response.raise_for_status() - break - except requests.exceptions.RequestException as e: - logger.info(f"Demo thread: API is not up yet. Waiting...") - await asyncio.sleep(5) - if demo_correlation_rules: get_or_create_correlation_rules(keep_api_key, keep_api_url) logger.info("Correlation rules created.") From d96465b98054c2f204c5510491c6ea38bc436ff3 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Tue, 19 Nov 2024 12:21:30 +0200 Subject: [PATCH 17/28] Fix --- keep/api/config.py | 3 +++ keep/api/core/demo_mode_runner.py | 30 ++++++++++++++++++------------ 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/keep/api/config.py b/keep/api/config.py index 791d2322b..69273e3ca 100644 --- a/keep/api/config.py +++ b/keep/api/config.py @@ -66,5 +66,8 @@ def when_ready(server): launch_uptime_reporting() if LIVE_DEMO_MODE: + logger.info("Launching Keep in demo mode.") from keep.api.core.demo_mode_runner import launch_demo_mode launch_demo_mode() + else: + logger.info("Not launching Keep in demo mode.") diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index 03f7f149a..11ad6e721 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -1,14 +1,15 @@ import os -import requests +import time +import random import asyncio import logging -import threading -import random -import time +import requests import datetime + from datetime import timezone from dateutil import parser from requests.models import PreparedRequest +from multiprocessing import Process from keep.api.core.db import get_session_sync from keep.api.core.dependencies import SINGLE_TENANT_UUID @@ -412,12 +413,17 @@ def launch_demo_mode(): system_description="Simulate Alerts API key", ) - thread = threading.Thread(target=asyncio.run, args=(simulate_alerts( - keep_api_url, - keep_api_key, - sleep_interval=5, - demo_correlation_rules=True, - demo_topology=True - ), )) - thread.start() + p = Process( + target=asyncio.run, + args=( + simulate_alerts( + keep_api_url, + keep_api_key, + sleep_interval=5, + demo_correlation_rules=True, + demo_topology=True + ), + ) + ) + p.start() logger.info("Demo mode initialized.") From 0fe790dec43e0809933af12fcaf566b6b48533e3 Mon Sep 17 00:00:00 2001 From: Tal Borenstein Date: Tue, 19 Nov 2024 13:12:44 +0200 Subject: [PATCH 18/28] fix: fix --- keep/api/core/demo_mode_runner.py | 130 ++++++++++++++---------------- 1 file changed, 59 insertions(+), 71 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index 11ad6e721..f838bafdd 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -1,47 +1,38 @@ -import os -import time -import random import asyncio -import logging -import requests import datetime - +import logging +import os +import random +import time from datetime import timezone + +import requests from dateutil import parser from requests.models import PreparedRequest -from multiprocessing import Process from keep.api.core.db import get_session_sync from keep.api.core.dependencies import SINGLE_TENANT_UUID +from keep.api.logging import CONFIG from keep.api.models.db.topology import TopologyServiceInDto from keep.api.tasks.process_topology_task import process_topology from keep.api.utils.tenant_utils import get_or_create_api_key from keep.providers.providers_factory import ProvidersFactory -logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s %(levelname)s %(name)s %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", -) +logging.config.dictConfig(CONFIG) logger = logging.getLogger(__name__) correlation_rules_to_create = [ { - "sqlQuery": { - "sql": "((name like :name_1))", - "params": { - "name_1": "%mq%" - } - }, + "sqlQuery": {"sql": "((name like :name_1))", "params": {"name_1": "%mq%"}}, "groupDescription": "This rule groups all alerts related to MQ.", "ruleName": "Message Queue Buckle Up", - "celQuery": "(name.contains(\"mq\"))", + "celQuery": '(name.contains("mq"))', "timeframeInSeconds": 86400, "timeUnit": "hours", "groupingCriteria": [], "requireApprove": False, - "resolveOn": "never" + "resolveOn": "never", }, { "sqlQuery": { @@ -49,17 +40,17 @@ "params": { "name_1": "%network_latency_high%", "name_2": "high_cpu_usage", - "name_3": "%database_connection_failure%" - } + "name_3": "%database_connection_failure%", + }, }, "groupDescription": "This rule groups alerts from multiple sources.", "ruleName": "Application issue caused by DB load", - "celQuery": "(name.contains(\"network_latency_high\")) || (name == \"high_cpu_usage\") || (name.contains(\"database_connection_failure\"))", + "celQuery": '(name.contains("network_latency_high")) || (name == "high_cpu_usage") || (name.contains("database_connection_failure"))', "timeframeInSeconds": 86400, "timeUnit": "hours", "groupingCriteria": [], "requireApprove": False, - "resolveOn": "never" + "resolveOn": "never", }, ] @@ -84,7 +75,7 @@ "queue": "AMQP", }, application_ids=[], - updated_at="2024-11-18T09:23:46" + updated_at="2024-11-18T09:23:46", ), TopologyServiceInDto( source_provider_id="Prod-Datadog", @@ -105,7 +96,7 @@ "api": "HTTP/S", }, application_ids=[], - updated_at="2024-11-18T09:29:25" + updated_at="2024-11-18T09:29:25", ), TopologyServiceInDto( source_provider_id="Prod-Datadog", @@ -124,7 +115,7 @@ manufacturer="", dependencies={}, application_ids=[], - updated_at="2024-11-18T09:30:44" + updated_at="2024-11-18T09:30:44", ), TopologyServiceInDto( source_provider_id="Prod-Datadog", @@ -145,7 +136,7 @@ "processor": "AMQP", }, application_ids=[], - updated_at="2024-11-18T09:31:31" + updated_at="2024-11-18T09:31:31", ), TopologyServiceInDto( source_provider_id="Prod-Datadog", @@ -166,7 +157,7 @@ "storage": "HTTP/S", }, application_ids=[], - updated_at="2024-11-18T10:02:20" + updated_at="2024-11-18T10:02:20", ), TopologyServiceInDto( source_provider_id="Prod-Datadog", @@ -187,7 +178,7 @@ "api": "HTTP/S", }, application_ids=[], - updated_at="2024-11-18T10:11:31" + updated_at="2024-11-18T10:11:31", ), TopologyServiceInDto( source_provider_id="Prod-Datadog", @@ -206,8 +197,8 @@ manufacturer="", dependencies={}, application_ids=[], - updated_at="2024-11-18T10:13:56" - ) + updated_at="2024-11-18T10:13:56", + ), ] application_to_create = { @@ -218,10 +209,11 @@ {"name": "DB", "service": "db"}, {"name": "Kafka", "service": "queue"}, {"name": "Processor", "service": "processor"}, - {"name": "Storage", "service": "storage"} - ] + {"name": "Storage", "service": "storage"}, + ], } + def get_or_create_topology(keep_api_key, keep_api_url): services_existing = requests.get( f"{keep_api_url}/topology", @@ -234,10 +226,7 @@ def get_or_create_topology(keep_api_key, keep_api_url): if len(services_existing) == 0: process_topology( - SINGLE_TENANT_UUID, - services_to_create, - "Prod-Datadog", - "datadog" + SINGLE_TENANT_UUID, services_to_create, "Prod-Datadog", "datadog" ) # Create application @@ -247,7 +236,7 @@ def get_or_create_topology(keep_api_key, keep_api_url): ) applications_existing.raise_for_status() applications_existing = applications_existing.json() - + if len(applications_existing) == 0: # Pull services again to get their ids services_existing = requests.get( @@ -262,14 +251,14 @@ def get_or_create_topology(keep_api_key, keep_api_url): for existing_service in services_existing: if service["name"] == existing_service["display_name"]: service["id"] = existing_service["id"] - + response = requests.post( f"{keep_api_url}/topology/applications", headers={"x-api-key": keep_api_key}, json=application_to_create, ) response.raise_for_status() - + def get_or_create_correlation_rules(keep_api_key, keep_api_url): correlation_rules_existing = requests.get( @@ -296,11 +285,12 @@ def remove_old_incidents(keep_api_key, keep_api_url): headers={"x-api-key": keep_api_key}, ) incidents_existing.raise_for_status() - incidents_existing = incidents_existing.json()['items'] + incidents_existing = incidents_existing.json()["items"] for incident in incidents_existing: - if parser.parse(incident["creation_time"]).replace(tzinfo=timezone.utc) < \ - (datetime.datetime.now() - consider_old_timedelta).astimezone(timezone.utc): + if parser.parse(incident["creation_time"]).replace(tzinfo=timezone.utc) < ( + datetime.datetime.now() - consider_old_timedelta + ).astimezone(timezone.utc): incident_id = incident["id"] response = requests.delete( f"{keep_api_url}/incidents/{incident_id}", @@ -310,16 +300,17 @@ def remove_old_incidents(keep_api_key, keep_api_url): async def simulate_alerts( - keep_api_url=None, - keep_api_key=None, - sleep_interval=5, - demo_correlation_rules=False, - demo_topology=False, - ): + keep_api_url=None, + keep_api_key=None, + sleep_interval=5, + demo_correlation_rules=False, + demo_topology=False, +): + logger.info("Simulating alerts...") GENERATE_DEDUPLICATIONS = True providers = [ - "prometheus", + "prometheus", "grafana", "cloudwatch", "datadog", @@ -336,33 +327,33 @@ async def simulate_alerts( } if demo_correlation_rules: + logger.info("Creating correlation rules...") get_or_create_correlation_rules(keep_api_key, keep_api_url) logger.info("Correlation rules created.") if demo_topology: + logger.info("Creating topology...") get_or_create_topology(keep_api_key, keep_api_url) logger.info("Topology created.") - logger.info(f"Waiting for server to start...") - - while True: logger.info("Looping to send alerts...") + logger.info("Removing old incidents...") remove_old_incidents(keep_api_key, keep_api_url) logger.info("Old incidents removed.") # choose provider provider_type = random.choice(providers) - send_alert_url = "{}/alerts/event/{}".format( - keep_api_url, provider_type) + send_alert_url = "{}/alerts/event/{}".format(keep_api_url, provider_type) provider = provider_classes[provider_type] alert = provider.simulate_alert() send_alert_url_params = {} if provider_type in providers_to_randomize_fingerprint_for: - send_alert_url_params['fingerprint'] = \ - ''.join(random.choices('abcdefghijklmnopqrstuvwxyz0123456789', k=10)) + send_alert_url_params["fingerprint"] = "".join( + random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=10) + ) # Determine number of times to send the same alert num_iterations = 1 @@ -373,7 +364,7 @@ async def simulate_alerts( logger.info("Sending alert: {}".format(alert)) try: env = random.choice(["production", "staging", "development"]) - send_alert_url_params['provider_id'] = f"{provider_type}-{env}" + send_alert_url_params["provider_id"] = f"{provider_type}-{env}" prepared_request = PreparedRequest() prepared_request.prepare_url(send_alert_url, send_alert_url_params) response = requests.post( @@ -402,7 +393,7 @@ def launch_demo_mode(): logger.info("Demo mode launched.") keep_api_url = "http://localhost:" + str(os.environ.get("PORT", 8080)) - keep_api_key = os.environ.get("KEEP_API_KEY") + keep_api_key = os.environ.get("KEEP_READ_ONLY_BYPASS_KEY") if keep_api_key is None: with get_session_sync() as session: keep_api_key = get_or_create_api_key( @@ -413,17 +404,14 @@ def launch_demo_mode(): system_description="Simulate Alerts API key", ) - p = Process( - target=asyncio.run, - args=( - simulate_alerts( - keep_api_url, - keep_api_key, - sleep_interval=5, - demo_correlation_rules=True, - demo_topology=True - ), + loop = asyncio.get_event_loop() + loop.create_task( + simulate_alerts( + keep_api_url, + keep_api_key, + sleep_interval=5, + demo_correlation_rules=True, + demo_topology=True, ) ) - p.start() logger.info("Demo mode initialized.") From 353e42cf752ca0179031e16937790b70b06fa9da Mon Sep 17 00:00:00 2001 From: Tal Borenstein Date: Tue, 19 Nov 2024 13:32:40 +0200 Subject: [PATCH 19/28] fix: fix --- keep/api/core/demo_mode_runner.py | 34 ++++++++++++++++++------------- scripts/simulate_alerts.py | 13 ++++++------ 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index f838bafdd..886fd480b 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -1,8 +1,8 @@ -import asyncio import datetime import logging import os import random +import threading import time from datetime import timezone @@ -299,7 +299,7 @@ def remove_old_incidents(keep_api_key, keep_api_url): response.raise_for_status() -async def simulate_alerts( +def simulate_alerts( keep_api_url=None, keep_api_key=None, sleep_interval=5, @@ -378,12 +378,15 @@ async def simulate_alerts( time.sleep(sleep_interval) continue - if response.status_code != 202: + if not response.ok: logger.error("Failed to send alert: {}".format(response.text)) else: logger.info("Alert sent successfully") - await asyncio.sleep(sleep_interval) + logger.info( + "Sleeping for {} seconds before next iteration".format(sleep_interval) + ) + time.sleep(sleep_interval) def launch_demo_mode(): @@ -392,7 +395,9 @@ def launch_demo_mode(): """ logger.info("Demo mode launched.") - keep_api_url = "http://localhost:" + str(os.environ.get("PORT", 8080)) + keep_api_url = os.environ.get( + "KEEP_API_URL", "http://localhost:" + str(os.environ.get("PORT", 8080)) + ) keep_api_key = os.environ.get("KEEP_READ_ONLY_BYPASS_KEY") if keep_api_key is None: with get_session_sync() as session: @@ -404,14 +409,15 @@ def launch_demo_mode(): system_description="Simulate Alerts API key", ) - loop = asyncio.get_event_loop() - loop.create_task( - simulate_alerts( - keep_api_url, - keep_api_key, - sleep_interval=5, - demo_correlation_rules=True, - demo_topology=True, - ) + thread = threading.Thread( + target=simulate_alerts, + kwargs={ + "keep_api_key": keep_api_key, + "keep_api_url": keep_api_url, + "sleep_interval": 0.2, + "demo_correlation_rules": True, + }, ) + thread.daemon = True + thread.start() logger.info("Demo mode initialized.") diff --git a/scripts/simulate_alerts.py b/scripts/simulate_alerts.py index abf7dafa1..56f273efb 100644 --- a/scripts/simulate_alerts.py +++ b/scripts/simulate_alerts.py @@ -1,6 +1,5 @@ -import os import logging -import asyncio +import os from keep.api.core.demo_mode_runner import simulate_alerts @@ -13,11 +12,13 @@ logger = logging.getLogger(__name__) -async def main(): - SLEEP_INTERVAL = float(os.environ.get("SLEEP_INTERVAL", 0.2)) # Configurable sleep interval from env variable +def main(): + SLEEP_INTERVAL = float( + os.environ.get("SLEEP_INTERVAL", 0.2) + ) # Configurable sleep interval from env variable keep_api_key = os.environ.get("KEEP_API_KEY") keep_api_url = os.environ.get("KEEP_API_URL") or "http://localhost:8080" - await simulate_alerts( + simulate_alerts( keep_api_key=keep_api_key, keep_api_url=keep_api_url, sleep_interval=SLEEP_INTERVAL, @@ -26,4 +27,4 @@ async def main(): if __name__ == "__main__": - asyncio.run(main()) + main() From e333e9ed84e1323fbd863613801f8b62f1f1f0d3 Mon Sep 17 00:00:00 2001 From: Tal Borenstein Date: Tue, 19 Nov 2024 13:51:33 +0200 Subject: [PATCH 20/28] fix: fix --- keep/api/core/demo_mode_runner.py | 87 ++++++++++++++++--------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index 886fd480b..b7e90560a 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -336,52 +336,57 @@ def simulate_alerts( logger.info("Topology created.") while True: - logger.info("Looping to send alerts...") + try: + logger.info("Looping to send alerts...") - logger.info("Removing old incidents...") - remove_old_incidents(keep_api_key, keep_api_url) - logger.info("Old incidents removed.") + logger.info("Removing old incidents...") + remove_old_incidents(keep_api_key, keep_api_url) + logger.info("Old incidents removed.") - # choose provider - provider_type = random.choice(providers) - send_alert_url = "{}/alerts/event/{}".format(keep_api_url, provider_type) - provider = provider_classes[provider_type] - alert = provider.simulate_alert() + # choose provider + provider_type = random.choice(providers) + send_alert_url = "{}/alerts/event/{}".format(keep_api_url, provider_type) + provider = provider_classes[provider_type] + alert = provider.simulate_alert() - send_alert_url_params = {} + send_alert_url_params = {} - if provider_type in providers_to_randomize_fingerprint_for: - send_alert_url_params["fingerprint"] = "".join( - random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=10) - ) - - # Determine number of times to send the same alert - num_iterations = 1 - if GENERATE_DEDUPLICATIONS: - num_iterations = random.randint(1, 3) - - for _ in range(num_iterations): - logger.info("Sending alert: {}".format(alert)) - try: - env = random.choice(["production", "staging", "development"]) - send_alert_url_params["provider_id"] = f"{provider_type}-{env}" - prepared_request = PreparedRequest() - prepared_request.prepare_url(send_alert_url, send_alert_url_params) - response = requests.post( - prepared_request.url, - headers={"x-api-key": keep_api_key}, - json=alert, + if provider_type in providers_to_randomize_fingerprint_for: + send_alert_url_params["fingerprint"] = "".join( + random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=10) ) - response.raise_for_status() # Raise an HTTPError for bad responses - except requests.exceptions.RequestException as e: - logger.error("Failed to send alert: {}".format(e)) - time.sleep(sleep_interval) - continue - - if not response.ok: - logger.error("Failed to send alert: {}".format(response.text)) - else: - logger.info("Alert sent successfully") + + # Determine number of times to send the same alert + num_iterations = 1 + if GENERATE_DEDUPLICATIONS: + num_iterations = random.randint(1, 3) + + for _ in range(num_iterations): + logger.info("Sending alert: {}".format(alert)) + try: + env = random.choice(["production", "staging", "development"]) + send_alert_url_params["provider_id"] = f"{provider_type}-{env}" + prepared_request = PreparedRequest() + prepared_request.prepare_url(send_alert_url, send_alert_url_params) + response = requests.post( + prepared_request.url, + headers={"x-api-key": keep_api_key}, + json=alert, + ) + response.raise_for_status() # Raise an HTTPError for bad responses + except requests.exceptions.RequestException as e: + logger.error("Failed to send alert: {}".format(e)) + time.sleep(sleep_interval) + continue + + if not response.ok: + logger.error("Failed to send alert: {}".format(response.text)) + else: + logger.info("Alert sent successfully") + except Exception as e: + logger.exception( + "Error in simulate_alerts", extra={"exception_str": str(e)} + ) logger.info( "Sleeping for {} seconds before next iteration".format(sleep_interval) From 2256347405dfb990e8b79d984d65f8471c98141d Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Tue, 19 Nov 2024 14:16:25 +0200 Subject: [PATCH 21/28] wait for the connection --- keep/api/core/demo_mode_runner.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index b7e90560a..82e645a46 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -326,6 +326,16 @@ def simulate_alerts( for provider in providers } + while True: + try: + logger.info(f"Demo thread: Checking if server is up at {keep_api_url}...") + response = requests.get(keep_api_url) + response.raise_for_status() + break + except requests.exceptions.RequestException as e: + logger.info(f"Demo thread: API is not up yet. Waiting...") + time.sleep(5) + if demo_correlation_rules: logger.info("Creating correlation rules...") get_or_create_correlation_rules(keep_api_key, keep_api_url) From baacc1324f8060d9148105df6fc3cd30180a2ef9 Mon Sep 17 00:00:00 2001 From: Tal Borenstein Date: Tue, 19 Nov 2024 14:20:45 +0200 Subject: [PATCH 22/28] fix: fix --- keep/api/core/demo_mode_runner.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index 82e645a46..b158bd591 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -332,8 +332,8 @@ def simulate_alerts( response = requests.get(keep_api_url) response.raise_for_status() break - except requests.exceptions.RequestException as e: - logger.info(f"Demo thread: API is not up yet. Waiting...") + except requests.exceptions.RequestException: + logger.info("Demo thread: API is not up yet. Waiting...") time.sleep(5) if demo_correlation_rules: @@ -436,3 +436,7 @@ def launch_demo_mode(): thread.daemon = True thread.start() logger.info("Demo mode initialized.") + + +if __name__ == "__main__": + launch_demo_mode() From c1dd51a7b24ac1ace87f7790d415496430bad7ac Mon Sep 17 00:00:00 2001 From: Tal Borenstein Date: Tue, 19 Nov 2024 14:33:20 +0200 Subject: [PATCH 23/28] fix: fix --- keep/api/core/demo_mode_runner.py | 35 ++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index b158bd591..a73b448f4 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -404,7 +404,7 @@ def simulate_alerts( time.sleep(sleep_interval) -def launch_demo_mode(): +def launch_demo_mode(use_thread: bool = True): """ Running async demo in the backgound. """ @@ -423,20 +423,27 @@ def launch_demo_mode(): unique_api_key_id="simulate_alerts", system_description="Simulate Alerts API key", ) - - thread = threading.Thread( - target=simulate_alerts, - kwargs={ - "keep_api_key": keep_api_key, - "keep_api_url": keep_api_url, - "sleep_interval": 0.2, - "demo_correlation_rules": True, - }, - ) - thread.daemon = True - thread.start() + if use_thread: + thread = threading.Thread( + target=simulate_alerts, + kwargs={ + "keep_api_key": keep_api_key, + "keep_api_url": keep_api_url, + "sleep_interval": 0.2, + "demo_correlation_rules": True, + }, + ) + thread.daemon = True + thread.start() + else: + simulate_alerts( + keep_api_key=keep_api_key, + keep_api_url=keep_api_url, + sleep_interval=0.2, + demo_correlation_rules=True, + ) logger.info("Demo mode initialized.") if __name__ == "__main__": - launch_demo_mode() + launch_demo_mode(use_thread=False) From 1e83a5c8cdfc1aefd7279a3f555c3d93459bfe4c Mon Sep 17 00:00:00 2001 From: Tal Borenstein Date: Tue, 19 Nov 2024 14:47:05 +0200 Subject: [PATCH 24/28] fix: timeout --- keep/api/core/demo_mode_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index a73b448f4..76ea6a380 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -414,6 +414,7 @@ def launch_demo_mode(use_thread: bool = True): "KEEP_API_URL", "http://localhost:" + str(os.environ.get("PORT", 8080)) ) keep_api_key = os.environ.get("KEEP_READ_ONLY_BYPASS_KEY") + keep_sleep_interval = int(os.environ.get("KEEP_SLEEP_INTERVAL", 5)) if keep_api_key is None: with get_session_sync() as session: keep_api_key = get_or_create_api_key( @@ -429,7 +430,7 @@ def launch_demo_mode(use_thread: bool = True): kwargs={ "keep_api_key": keep_api_key, "keep_api_url": keep_api_url, - "sleep_interval": 0.2, + "sleep_interval": keep_sleep_interval, "demo_correlation_rules": True, }, ) @@ -439,7 +440,7 @@ def launch_demo_mode(use_thread: bool = True): simulate_alerts( keep_api_key=keep_api_key, keep_api_url=keep_api_url, - sleep_interval=0.2, + sleep_interval=keep_sleep_interval, demo_correlation_rules=True, ) logger.info("Demo mode initialized.") From d11b44c919147589fb9f75a949fddd24995da31b Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Tue, 19 Nov 2024 17:56:51 +0200 Subject: [PATCH 25/28] Providers --- keep/api/config.py | 5 ----- keep/api/core/demo_mode_runner.py | 24 +++++++++++++++++++++--- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/keep/api/config.py b/keep/api/config.py index 69273e3ca..474206812 100644 --- a/keep/api/config.py +++ b/keep/api/config.py @@ -58,11 +58,6 @@ def on_starting(server=None): logger.info("Keep server started") - -def when_ready(server): - """This function is called by the gunicorn server when it is ready to accept requests""" - logger.info("Keep server ready") - launch_uptime_reporting() if LIVE_DEMO_MODE: diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index b158bd591..cd6ad212a 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -299,6 +299,14 @@ def remove_old_incidents(keep_api_key, keep_api_url): response.raise_for_status() +def get_existing_installed_providers(keep_api_key, keep_api_url): + response = requests.get( + f"{keep_api_url}/providers", + headers={"x-api-key": keep_api_key}, + ) + response.raise_for_status() + return response.json()['installed_providers'] + def simulate_alerts( keep_api_url=None, keep_api_key=None, @@ -307,6 +315,7 @@ def simulate_alerts( demo_topology=False, ): logger.info("Simulating alerts...") + GENERATE_DEDUPLICATIONS = True providers = [ @@ -336,6 +345,12 @@ def simulate_alerts( logger.info("Demo thread: API is not up yet. Waiting...") time.sleep(5) + existing_installed_providers = get_existing_installed_providers(keep_api_key, keep_api_url) + existing_providers_to_their_ids = {} + for existing_provider in existing_installed_providers: + if existing_provider['type'] in providers: + existing_providers_to_their_ids[existing_provider['type']] = existing_provider['id'] + if demo_correlation_rules: logger.info("Creating correlation rules...") get_or_create_correlation_rules(keep_api_key, keep_api_url) @@ -353,14 +368,16 @@ def simulate_alerts( remove_old_incidents(keep_api_key, keep_api_url) logger.info("Old incidents removed.") + send_alert_url_params = {} + # choose provider provider_type = random.choice(providers) send_alert_url = "{}/alerts/event/{}".format(keep_api_url, provider_type) + if provider_type in existing_providers_to_their_ids: + send_alert_url_params["provider_id"] = existing_providers_to_their_ids[provider_type] provider = provider_classes[provider_type] alert = provider.simulate_alert() - send_alert_url_params = {} - if provider_type in providers_to_randomize_fingerprint_for: send_alert_url_params["fingerprint"] = "".join( random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=10) @@ -429,8 +446,9 @@ def launch_demo_mode(): kwargs={ "keep_api_key": keep_api_key, "keep_api_url": keep_api_url, - "sleep_interval": 0.2, + "sleep_interval": 5, "demo_correlation_rules": True, + "demo_topology": True, }, ) thread.daemon = True From bfdd5769b16873c1a947830f7606ed0b20aeb85e Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Tue, 19 Nov 2024 18:14:29 +0200 Subject: [PATCH 26/28] customer_id:acme --- keep/providers/prometheus_provider/alerts_mock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keep/providers/prometheus_provider/alerts_mock.py b/keep/providers/prometheus_provider/alerts_mock.py index 3c03bd675..d29197074 100644 --- a/keep/providers/prometheus_provider/alerts_mock.py +++ b/keep/providers/prometheus_provider/alerts_mock.py @@ -44,7 +44,7 @@ }, "network_latency_high": { "payload": { - "summary": "Network latency is higher than normal", + "summary": "Network latency is higher than normal for customer_id:acme", "labels": { "severity": "info", }, From 5ce36297832a4e13cf6bc494aee4dbec02e8ff26 Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Wed, 20 Nov 2024 10:33:28 +0200 Subject: [PATCH 27/28] Fix provider reading --- keep/api/core/demo_mode_runner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index baf460d55..a004c5716 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -373,8 +373,10 @@ def simulate_alerts( # choose provider provider_type = random.choice(providers) send_alert_url = "{}/alerts/event/{}".format(keep_api_url, provider_type) + if provider_type in existing_providers_to_their_ids: send_alert_url_params["provider_id"] = existing_providers_to_their_ids[provider_type] + provider = provider_classes[provider_type] alert = provider.simulate_alert() @@ -392,7 +394,8 @@ def simulate_alerts( logger.info("Sending alert: {}".format(alert)) try: env = random.choice(["production", "staging", "development"]) - send_alert_url_params["provider_id"] = f"{provider_type}-{env}" + if not "provider_id" in send_alert_url_params: + send_alert_url_params["provider_id"] = f"{provider_type}-{env}" prepared_request = PreparedRequest() prepared_request.prepare_url(send_alert_url, send_alert_url_params) response = requests.post( From ddcfcdf4705b13c5331d66f3f361fbfbe22903ed Mon Sep 17 00:00:00 2001 From: Matvey Kukuy Date: Wed, 20 Nov 2024 11:03:54 +0200 Subject: [PATCH 28/28] Debug logs --- keep/api/core/demo_mode_runner.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/keep/api/core/demo_mode_runner.py b/keep/api/core/demo_mode_runner.py index a004c5716..095c02f6c 100644 --- a/keep/api/core/demo_mode_runner.py +++ b/keep/api/core/demo_mode_runner.py @@ -17,11 +17,18 @@ from keep.api.tasks.process_topology_task import process_topology from keep.api.utils.tenant_utils import get_or_create_api_key from keep.providers.providers_factory import ProvidersFactory +# import json logging.config.dictConfig(CONFIG) logger = logging.getLogger(__name__) +# file_path = '/Users/matvey/Desktop/keep-oss/keep/pr.json' +# def read_json_file(file_path): +# with open(file_path, 'r') as file: +# return json.load(file) +# pr_json = read_json_file(file_path) + correlation_rules_to_create = [ { "sqlQuery": {"sql": "((name like :name_1))", "params": {"name_1": "%mq%"}}, @@ -346,11 +353,15 @@ def simulate_alerts( time.sleep(5) existing_installed_providers = get_existing_installed_providers(keep_api_key, keep_api_url) + # existing_installed_providers = pr_json['installed_providers'] + logger.info(f"Existing installed providers: {existing_installed_providers}") existing_providers_to_their_ids = {} for existing_provider in existing_installed_providers: if existing_provider['type'] in providers: existing_providers_to_their_ids[existing_provider['type']] = existing_provider['id'] + logger.info(f"Existing installed existing_providers_to_their_ids: {existing_providers_to_their_ids}") + if demo_correlation_rules: logger.info("Creating correlation rules...") get_or_create_correlation_rules(keep_api_key, keep_api_url) @@ -376,6 +387,7 @@ def simulate_alerts( if provider_type in existing_providers_to_their_ids: send_alert_url_params["provider_id"] = existing_providers_to_their_ids[provider_type] + logger.info(f"Provider type: {provider_type}, send_alert_url_params now are: {send_alert_url_params}") provider = provider_classes[provider_type] alert = provider.simulate_alert() @@ -398,6 +410,7 @@ def simulate_alerts( send_alert_url_params["provider_id"] = f"{provider_type}-{env}" prepared_request = PreparedRequest() prepared_request.prepare_url(send_alert_url, send_alert_url_params) + logger.info(f"Sending alert to {prepared_request.url} with url params {send_alert_url_params}") response = requests.post( prepared_request.url, headers={"x-api-key": keep_api_key},