diff --git a/.gitignore b/.gitignore index 38d405e05..8b8a8989c 100644 --- a/.gitignore +++ b/.gitignore @@ -216,3 +216,6 @@ keepnew.db providers_cache.json tests/provision/* +grafana/* +!grafana/provisioning/ +!grafana/dashboards/ \ No newline at end of file diff --git a/STRESS.md b/STRESS.md deleted file mode 100644 index dd248cc22..000000000 --- a/STRESS.md +++ /dev/null @@ -1,58 +0,0 @@ - -# UNDER CONSTRUCTION - -# First, create a Kubernetes cluster - - -# Install Keep -gcloud config set project keep-dev-429814 -gcloud container clusters get-credentials keep-stress --zone us-central1-c --project keep-dev-429814 -helm repo add keephq https://keephq.github.io/helm-charts -helm pull keephq/keep -# create the namespace -kubectl create namespace keep -# install keep -helm install keep keephq/keep --namespace keep -# from local -helm install keep ./charts/keep --namespace keep - -kubectl -n keep describe pod keep-backend-697f6b946f-v2jxp -kubectl -n keep logs keep-frontend-577fdf5497-r8ht9 -# Import alerts - -# uninstall -helm uninstall keep --namespace keep - -kubectl -n keep exec -it keep-backend-64c4d7ddb7-7p5q5 /bin/bash -# copy the db -kubectl -n keep exec -it keep-database-86dd6b6775-92sz4 /bin/bash -kubectl -n keep cp ./keep.sql keep-database-659c69689-vxhkz:/tmp/keep.sql -kubectl -n keep exec -it keep-database-659c69689-vxhkz -- bash -c "mysql -u root keep < /tmp/keep.sql" -# exec into the pod -kubectl -n keep exec -it keep-database-86dd6b6775-92sz4 -- /bin/bash -# import -kubectl -n keep exec -it keep-database-659c69689-vxhkz -- bash -c "mysql -u root keep < /tmp/keep.sql" - -# No Load -## 500k alerts - 1Gi/250m cpu: get_last_alerts 2 minutes and 30 seconds -Keep Backend Workers get a timeout after one minute (status code 500 for preset and alert endpoints) -## 500k alerts - 2Gi/500m cpu: -- default mysql: get_last_alerts 1 minutes and 30 seconds -- innodb_buffer_pool_size = 4294967296: 25 seconds, 3 seconds after cache -## 500k alerts - 4Gi/1 cpu: 
get_last_alerts 2 minutes and 30 seconds -- -## 500k alerts - 8Gi/1 cpu: get_last_alerts 2 minutes and 30 seconds - -# Load 10 alerts per minute - -# Load 100 alerts per minute - -# Load 1000 alerts per minute - - -## 1M alerts -# Load 10 alerts per minute - -# Load 100 alerts per minute - -# Load 1000 alerts per minute diff --git a/docker-compose.yml b/docker-compose.yml index 68291e6b6..14b6001ad 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,3 +26,33 @@ services: extends: file: docker-compose.common.yml service: keep-websocket-server-common + + grafana: + image: grafana/grafana:latest + profiles: + - grafana + ports: + - "3001:3000" + volumes: + - ./grafana:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning + - ./grafana/dashboards:/etc/grafana/dashboards + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + depends_on: + - prometheus + + prometheus: + image: prom/prometheus:latest + profiles: + - grafana + ports: + - "9090:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + command: + - "--config.file=/etc/prometheus/prometheus.yml" + depends_on: + - keep-backend diff --git a/grafana/dashboards/keep.json b/grafana/dashboards/keep.json new file mode 100644 index 000000000..a94725b1f --- /dev/null +++ b/grafana/dashboards/keep.json @@ -0,0 +1,737 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, 
+ "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "title": "Request Duration by Endpoint", + "type": "timeseries", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(keep_http_request_duration_seconds_sum{handler!=\"none\"}[5m]) / rate(keep_http_request_duration_seconds_count{handler!=\"none\"}[5m])", + "legendFormat": "{{handler}}" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(keep_running_tasks_current)", + "refId": "A" + } + ], + "title": "Running Tasks", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(keep_http_requests_total{status=~\"2..\"}[5m])) by (handler)", + "legendFormat": "{{handler}}", + "refId": "A" + } + ], + "title": "Request Rate by Endpoint (2xx)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + 
"mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(keep_events_in_total[5m])", + "legendFormat": "Events In", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(keep_events_processed_total[5m])", + "legendFormat": "Events Processed", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(keep_events_error_total[5m])", + "legendFormat": "Events Error", + "refId": "C" + } + ], + "title": "Events Processing Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 5, + "title": "Workflow 
Execution Duration", + "type": "timeseries", + "targets": [ + { + "expr": "rate(keep_workflows_execution_duration_seconds_sum[5m]) / rate(keep_workflows_execution_duration_seconds_count[5m])", + "legendFormat": "{{workflow_id}}" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 6, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "title": "Workflow Queue Size", + "type": "gauge", + "targets": [ + { + "expr": "keep_workflows_queue_size", + "legendFormat": "{{tenant_id}}" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 7, + "title": "Workflow Executions", + "type": "timeseries", + "targets": [ + { + "expr": "rate(keep_workflows_executions_total[5m])", + "legendFormat": "{{workflow_id}} 
({{trigger_type}})" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "keep_events_in_total", + "legendFormat": "Total Events In", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "keep_events_processed_total", + "legendFormat": "Total Events Processed", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "keep_events_error_total", + "legendFormat": "Total Events Error", + "refId": "C" + } + ], + "title": "Total Events", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 9, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "keep_workflows_executions_total", + "legendFormat": "{{workflow_id}} ({{trigger_type}})", + "refId": "A" + } + ], + "title": "Total Workflow Executions", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["keep"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Keep Dashboard", + "uid": "keep", + "version": 1, + "weekStart": "" +} diff --git a/grafana/provisioning/dashboards/keep.yml b/grafana/provisioning/dashboards/keep.yml new file mode 100644 index 000000000..6213d6185 --- /dev/null +++ b/grafana/provisioning/dashboards/keep.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: "Keep" + orgId: 1 + folder: "" + type: file + disableDeletion: false + editable: true + options: + path: /etc/grafana/dashboards diff --git a/grafana/provisioning/datasources/prometheus.yml b/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 
000000000..a221c3c37 --- /dev/null +++ b/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,9 @@ +apiVersion: 1 + +datasources: + - name: prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true diff --git a/keep/api/api.py b/keep/api/api.py index 7be28833a..ae1a44ac9 100644 --- a/keep/api/api.py +++ b/keep/api/api.py @@ -320,7 +320,9 @@ async def catch_exception(request: Request, exc: Exception): app.add_middleware(LoggingMiddleware) if config("KEEP_METRICS", default="true", cast=bool): - Instrumentator().instrument(app=app, metric_namespace="keep") + Instrumentator( + excluded_handlers=["/metrics", "/metrics/processing"] + ).instrument(app=app, metric_namespace="keep") keep.api.observability.setup(app) return app diff --git a/keep/providers/cilium_provider/cilium_provider.py b/keep/providers/cilium_provider/cilium_provider.py index e72d1c238..400133827 100644 --- a/keep/providers/cilium_provider/cilium_provider.py +++ b/keep/providers/cilium_provider/cilium_provider.py @@ -7,9 +7,6 @@ from keep.api.models.db.topology import TopologyServiceInDto from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseTopologyProvider -from keep.providers.cilium_provider.grpc.observer_pb2 import (FlowFilter, - GetFlowsRequest) -from keep.providers.cilium_provider.grpc.observer_pb2_grpc import ObserverStub from keep.providers.models.provider_config import ProviderConfig from keep.validation.fields import NoSchemeUrl @@ -24,7 +21,7 @@ class CiliumProviderAuthConfig: "description": "The base endpoint of the cilium hubble relay", "sensitive": False, "hint": "localhost:4245", - "validation": "no_scheme_url" + "validation": "no_scheme_url", } ) @@ -82,6 +79,15 @@ def _get_service_name(self, endpoint) -> str: return service def pull_topology(self) -> list[TopologyServiceInDto]: + # for some providers that depend on grpc, like the cilium provider, this might fail on imports not 
from Keep (such as the docs script) + from keep.providers.cilium_provider.grpc.observer_pb2 import ( # noqa + FlowFilter, + GetFlowsRequest, + ) + from keep.providers.cilium_provider.grpc.observer_pb2_grpc import ( # noqa + ObserverStub, + ) + channel = grpc.insecure_channel(self.authentication_config.cilium_base_endpoint) stub = ObserverStub(channel) diff --git a/keep/providers/providers_factory.py b/keep/providers/providers_factory.py index 3a2ec4131..2cea90713 100644 --- a/keep/providers/providers_factory.py +++ b/keep/providers/providers_factory.py @@ -414,9 +414,9 @@ def get_all_providers(ignore_cache_file: bool = False) -> list[Provider]: ) continue # for some providers that depends on grpc like cilium provider, this might fail on imports not from Keep (such as the docs script) - except TypeError: + except TypeError as e: logger.warning( - f"Cannot import provider {provider_directory}, unexpected error." + f"Cannot import provider {provider_directory}, unexpected error. ({str(e)})" ) continue diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml new file mode 100644 index 000000000..c3a5987a0 --- /dev/null +++ b/prometheus/prometheus.yml @@ -0,0 +1,13 @@ +global: + scrape_interval: 5s + evaluation_interval: 5s + +scrape_configs: + - job_name: "keep" + static_configs: + - targets: ["keep-backend:8080"] + metrics_path: "/metrics/processing" + http_headers: + x-api-key: + values: + - "keep-api-key"