feat: Demo mode #2523

Closed
wants to merge 33 commits
Changes from 4 commits
Commits (33)
e5001e9  Demo mode (Matvey-Kuk, Nov 18, 2024)
31a3e88  Docs (Matvey-Kuk, Nov 18, 2024)
be886ff  Merge branch 'main' into Matvey-Kuk/demo_mode (Matvey-Kuk, Nov 18, 2024)
c682491  Fix (Matvey-Kuk, Nov 18, 2024)
c63e469  up (Matvey-Kuk, Nov 18, 2024)
7daf2ce  minor (Matvey-Kuk, Nov 18, 2024)
9b8c52b  fix (Matvey-Kuk, Nov 18, 2024)
5bc2ed9  Read or Create topology (Matvey-Kuk, Nov 18, 2024)
294a792  Fingerprint randomization (Matvey-Kuk, Nov 18, 2024)
d045a28  Merge branch 'main' into Matvey-Kuk/demo_mode (Matvey-Kuk, Nov 18, 2024)
42cbb09  conditional import (Matvey-Kuk, Nov 18, 2024)
b34c1df  Update keep/api/config.py (Matvey-Kuk, Nov 18, 2024)
ac3b392  Update docs/deployment/configuration.mdx (Matvey-Kuk, Nov 18, 2024)
fe22a75  Merge branch 'main' into Matvey-Kuk/demo_mode (Matvey-Kuk, Nov 18, 2024)
b8edba1  Fix (Matvey-Kuk, Nov 18, 2024)
a93fa0c  Merge remote-tracking branch 'refs/remotes/origin/Matvey-Kuk/demo_mod… (Matvey-Kuk, Nov 18, 2024)
ce121da  Up (Matvey-Kuk, Nov 18, 2024)
89faba0  More logs (Matvey-Kuk, Nov 18, 2024)
3fe4071  Waiting for API to get live (Matvey-Kuk, Nov 18, 2024)
0e329b2  when_ready (Matvey-Kuk, Nov 19, 2024)
d96465b  Fix (Matvey-Kuk, Nov 19, 2024)
0fe790d  fix: fix (talboren, Nov 19, 2024)
353e42c  fix: fix (talboren, Nov 19, 2024)
e333e9e  fix: fix (talboren, Nov 19, 2024)
2256347  wait for the connection (Matvey-Kuk, Nov 19, 2024)
baacc13  fix: fix (talboren, Nov 19, 2024)
c1dd51a  fix: fix (talboren, Nov 19, 2024)
1e83a5c  fix: timeout (talboren, Nov 19, 2024)
d11b44c  Providers (Matvey-Kuk, Nov 19, 2024)
bfdd576  customer_id:acme (Matvey-Kuk, Nov 19, 2024)
d3ece2a  Merge remote-tracking branch 'refs/remotes/origin/Matvey-Kuk/demo_mod… (Matvey-Kuk, Nov 19, 2024)
5ce3629  Fix provider reading (Matvey-Kuk, Nov 20, 2024)
ddcfcdf  Debug logs (Matvey-Kuk, Nov 20, 2024)
1 change: 1 addition & 0 deletions docs/deployment/configuration.mdx
@@ -25,6 +25,7 @@ General configuration variables control the core behavior of the Keep server. Th
| **KEEP_API_URL** | Specifies the Keep API URL | No | Constructed from HOST and PORT | Valid URL |
| **KEEP_STORE_RAW_ALERTS** | Enables storing of raw alerts | No | "false" | "true" or "false" |
| **TENANT_CONFIGURATION_RELOAD_TIME** | Time in minutes to reload tenant configurations | No | 5 | Positive integer |
| **LIVE_DEMO_MODE** | Keep will simulate incoming alerts and other activity | No | "false" | "true" or "false" |

### Logging and Environment
<Info>
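As a quick way to confirm the new `LIVE_DEMO_MODE` flag is actually producing activity once the server is up, the simulated incidents can be inspected through the same `/incidents` endpoint the demo runner below polls. A minimal sketch, not part of this PR; the API URL and key are placeholders:

```python
# Minimal verification sketch (not from this PR): list incidents produced while
# LIVE_DEMO_MODE=true. The endpoint and x-api-key header mirror the demo runner
# below; KEEP_API_URL / KEEP_API_KEY are placeholder environment variables.
import os

import requests

keep_api_url = os.environ.get("KEEP_API_URL", "http://localhost:8080")
keep_api_key = os.environ["KEEP_API_KEY"]

response = requests.get(
    f"{keep_api_url}/incidents",
    headers={"x-api-key": keep_api_key},
    timeout=10,
)
response.raise_for_status()
for incident in response.json()["items"]:
    print(incident["id"], incident["creation_time"])
```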
2 changes: 2 additions & 0 deletions keep/api/config.py
@@ -5,6 +5,7 @@
from keep.api.api import AUTH_TYPE
from keep.api.core.db_on_start import migrate_db, try_create_single_tenant
from keep.api.core.report_uptime import launch_uptime_reporting
from keep.api.core.demo_mode_runner import launch_demo_mode
from keep.api.core.dependencies import SINGLE_TENANT_UUID
from keep.identitymanager.identitymanagerfactory import IdentityManagerTypes

@@ -20,6 +21,7 @@ def on_starting(server=None):

    migrate_db()
    launch_uptime_reporting()
    launch_demo_mode()

    # Create single tenant if it doesn't exist
    if AUTH_TYPE in [
185 changes: 185 additions & 0 deletions keep/api/core/demo_mode_runner.py
@@ -0,0 +1,185 @@
import os
import requests
import asyncio
import logging
import threading
import random
import time, datetime
from datetime import timezone

from keep.api.core.db import get_session_sync
from keep.api.core.dependencies import SINGLE_TENANT_UUID
from keep.api.utils.tenant_utils import get_or_create_api_key
from keep.providers.providers_factory import ProvidersFactory

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger(__name__)

LIVE_DEMO_MODE = os.environ.get("LIVE_DEMO_MODE", "false").lower() == "true"

incidents = [
    {"name": "Performance issue, CPU, DB, UI impacted", "severity": "critical"},
    {
        "name": "Message queue bucles up",
        "severity": "warning",
    },
]

correlation_rules_to_create = [
    {
        "sqlQuery": {
            "sql": "((name like :name_1))",
            "params": {
                "name_1": "%mq%"
            }
        },
        "groupDescription": "This rule groups all alerts related to MQ.",
        "ruleName": "Message Queue Bucle Up",
        "celQuery": "(name.contains(\"mq\"))",
        "timeframeInSeconds": 86400,
        "timeUnit": "hours",
        "groupingCriteria": [],
        "requireApprove": False,
        "resolveOn": "never"
    },
    {
        "sqlQuery": {
            "sql": "((name like :name_1) or (name = :name_2) or (name like :name_3))",
            "params": {
                "name_1": "%network_latency_high%",
                "name_2": "high_cpu_usage",
                "name_3": "%database_connection_failure%"
            }
        },
        "groupDescription": "This rule groups alerts from multiple sources.",
        "ruleName": "Application issue caused by DB load",
        "celQuery": "(name.contains(\"network_latency_high\")) || (name == \"high_cpu_usage\") || (name.contains(\"database_connection_failure\"))",
        "timeframeInSeconds": 86400,
        "timeUnit": "hours",
        "groupingCriteria": [],
        "requireApprove": False,
        "resolveOn": "never"
    },
]


def get_or_create_correlation_rules(keep_api_key, keep_api_url):
    correlation_rules_existing = requests.get(
        f"{keep_api_url}/rules",
        headers={"x-api-key": keep_api_key},
    )
    correlation_rules_existing.raise_for_status()
    correlation_rules_existing = correlation_rules_existing.json()

    if len(correlation_rules_existing) == 0:
        for correlation_rule in correlation_rules_to_create:
            response = requests.post(
                f"{keep_api_url}/rules",
                headers={"x-api-key": keep_api_key},
                json=correlation_rule,
            )
            response.raise_for_status()


def remove_old_incidents(keep_api_key, keep_api_url):
    incidents_existing = requests.get(
        f"{keep_api_url}/incidents",
        headers={"x-api-key": keep_api_key},
    )
    incidents_existing.raise_for_status()
    incidents_existing = incidents_existing.json()['items']

    for incident in incidents_existing:
        if datetime.datetime.strptime(
            incident["creation_time"], "%Y-%m-%dT%H:%M:%S.%f"
        ).replace(tzinfo=timezone.utc) < (datetime.datetime.now() - datetime.timedelta(minutes=30)).astimezone(timezone.utc):
            incident_id = incident["id"]
            response = requests.delete(
                f"{keep_api_url}/incidents/{incident_id}",
                headers={"x-api-key": keep_api_key},
            )
            response.raise_for_status()


async def simulate_alerts(keep_api_url=None, keep_api_key=None, sleep_interval=5, demo_correlation_rules=False):
    GENERATE_DEDUPLICATIONS = True

    providers = ["prometheus", "grafana"]

    provider_classes = {
        provider: ProvidersFactory.get_provider_class(provider)
        for provider in providers
    }

    # Wait in the beginning because server may not be ready yet.
    await asyncio.sleep(sleep_interval * 2)

    get_or_create_correlation_rules(keep_api_key, keep_api_url)

    while True:
        await asyncio.sleep(sleep_interval)

        remove_old_incidents(keep_api_key, keep_api_url)

        # choose provider
        provider_type = random.choice(providers)
        send_alert_url = "{}/alerts/event/{}".format(
            keep_api_url, provider_type)
        provider = provider_classes[provider_type]
        alert = provider.simulate_alert()

        # Determine number of times to send the same alert
        num_iterations = 1
        if GENERATE_DEDUPLICATIONS:
            num_iterations = random.randint(1, 3)

        for _ in range(num_iterations):
            logger.info("Sending alert: {}".format(alert))
            try:
                env = random.choice(["production", "staging", "development"])
                response = requests.post(
                    send_alert_url + f"?provider_id={provider_type}-{env}",
                    headers={"x-api-key": keep_api_key},
                    json=alert,
                )
                response.raise_for_status()  # Raise an HTTPError for bad responses
            except requests.exceptions.RequestException as e:
                logger.error("Failed to send alert: {}".format(e))
                time.sleep(sleep_interval)
                continue

            if response.status_code != 202:
                logger.error("Failed to send alert: {}".format(response.text))
            else:
                logger.info("Alert sent successfully")
def launch_demo_mode():
    """
    Running async demo in the background.
    """
    keep_api_url = "http://localhost:" + str(os.environ.get("PORT", 8080))
    keep_api_key = get_or_create_api_key(
        session=get_session_sync(),
        tenant_id=SINGLE_TENANT_UUID,
        created_by="system",
        unique_api_key_id="simulate_alerts",
        system_description="Simulate Alerts API key",
    )
    sleep_interval = 5

    if LIVE_DEMO_MODE:
        thread = threading.Thread(target=asyncio.run, args=(simulate_alerts(
            keep_api_url,
            keep_api_key,
            sleep_interval,
            demo_correlation_rules=True
        ), ))
        thread.start()
        logger.info("Simulate Alert launched.")
    else:
        logger.info("Alert simulation is disabled.")
62 changes: 11 additions & 51 deletions scripts/simulate_alerts.py
@@ -1,13 +1,9 @@
import logging
import os
import random
import time

import requests
import logging
import asyncio

from keep.providers.providers_factory import ProvidersFactory
from keep.api.core.demo_mode_runner import simulate_alerts

# configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
@@ -17,53 +13,17 @@
logger = logging.getLogger(__name__)


def main():
    GENERATE_DEDUPLICATIONS = True
async def main():
    SLEEP_INTERVAL = float(os.environ.get("SLEEP_INTERVAL", 0.2)) # Configurable sleep interval from env variable
    keep_api_key = os.environ.get("KEEP_API_KEY")
    keep_api_url = os.environ.get("KEEP_API_URL") or "http://localhost:8080"
    if keep_api_key is None or keep_api_url is None:
        raise Exception("KEEP_API_KEY and KEEP_API_URL must be set")

    providers = ["prometheus", "grafana"]
    provider_classes = {
        provider: ProvidersFactory.get_provider_class(provider)
        for provider in providers
    }
    while True:
        # choose provider
        provider_type = random.choice(providers)
        send_alert_url = "{}/alerts/event/{}".format(keep_api_url, provider_type)
        provider = provider_classes[provider_type]
        alert = provider.simulate_alert()

        # Determine number of times to send the same alert
        num_iterations = 1
        if GENERATE_DEDUPLICATIONS:
            num_iterations = random.randint(1, 3)

        for _ in range(num_iterations):
            logger.info("Sending alert: {}".format(alert))
            try:
                env = random.choice(["production", "staging", "development"])
                response = requests.post(
                    send_alert_url + f"?provider_id={provider_type}-{env}",
                    headers={"x-api-key": keep_api_key},
                    json=alert,
                )
                response.raise_for_status()  # Raise an HTTPError for bad responses
            except requests.exceptions.RequestException as e:
                logger.error("Failed to send alert: {}".format(e))
                time.sleep(SLEEP_INTERVAL)
                continue

            if response.status_code != 202:
                logger.error("Failed to send alert: {}".format(response.text))
            else:
                logger.info("Alert sent successfully")

        time.sleep(SLEEP_INTERVAL) # Wait for the configured interval before sending the next alert
    await simulate_alerts(
        keep_api_key=keep_api_key,
        keep_api_url=keep_api_url,
        sleep_interval=SLEEP_INTERVAL,
        demo_correlation_rules=False,
    )


if __name__ == "__main__":
    main()
    asyncio.run(main())
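With this refactor the script is a thin wrapper around `simulate_alerts`. A hedged sketch of invoking it, where every environment value is a placeholder rather than something defined in this PR:

```python
# Hypothetical invocation of the refactored script; all values are placeholders.
# The script raises an exception if KEEP_API_KEY is not set.
import os
import subprocess

env = {
    **os.environ,
    "KEEP_API_KEY": "<your-keep-api-key>",    # any valid Keep API key
    "KEEP_API_URL": "http://localhost:8080",  # default used when unset
    "SLEEP_INTERVAL": "1",                    # seconds between simulated alerts
}
subprocess.run(["python", "scripts/simulate_alerts.py"], env=env, check=True)
```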