Skip to content

Commit

Permalink
Merge branch 'main' into feat-e2e-tests
Browse files Browse the repository at this point in the history
  • Loading branch information
35C4n0r authored Jan 31, 2025
2 parents 05a801c + 418d0e6 commit 94c2786
Showing 136 changed files with 7,930 additions and 1,520 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/auto-resolve-keep.yml
Original file line number Diff line number Diff line change
@@ -61,7 +61,7 @@ jobs:
url: "https://api.keephq.dev/incidents/${{ steps.set_ids.outputs.final_incident_id }}/enrich"
method: "POST"
customHeaders: '{"X-API-KEY": "${{ secrets.KEEP_API_KEY }}", "Content-Type": "application/json"}'
data: '{"enrichments":{"incident_title":"${{ github.event.pull_request.title || ''Manual resolution'' }}","incident_url":"${{ github.event.pull_request.html_url || github.server_url }}//${{ github.repository }}/actions/runs/${{ github.run_id }}", "incident_id": "${{ github.run_id }}", "incident_provider": "github"}}'
data: '{"enrichments":{"incident_title":"${{ github.event.pull_request.title || ''Manual resolution'' }}","incident_url":"${{ github.event.pull_request.html_url || format(''{0}/{1}/actions/runs/{2}'', github.server_url, github.repository, github.run_id) }}", "incident_id": "${{ github.run_id }}", "incident_provider": "github"}}'

- name: Auto resolve Keep alert
if: |
@@ -72,4 +72,4 @@ jobs:
url: "https://api.keephq.dev/alerts/enrich?dispose_on_new_alert=true"
method: "POST"
customHeaders: '{"Content-Type": "application/json", "X-API-KEY": "${{ secrets.KEEP_API_KEY }}"}'
data: '{"enrichments":{"status":"${{ inputs.status || ''resolved'' }}","dismissed":false,"dismissUntil":"","note":"${{ github.event.pull_request.title || ''Manual resolution'' }}","ticket_url":"${{ github.event.pull_request.html_url || github.server_url }}//${{ github.repository }}/actions/runs/${{ github.run_id }}"},"fingerprint":"${{ steps.set_ids.outputs.final_alert_fingerprint }}"}'
data: '{"enrichments":{"status":"${{ inputs.status || ''resolved'' }}","dismissed":false,"dismissUntil":"","note":"${{ github.event.pull_request.title || ''Manual resolution'' }}","ticket_url":"${{ github.event.pull_request.html_url || format(''{0}/{1}/actions/runs/{2}'', github.server_url, github.repository, github.run_id) }}"},"fingerprint":"${{ steps.set_ids.outputs.final_alert_fingerprint }}"}'
6 changes: 3 additions & 3 deletions .github/workflows/test-pr.yml
Original file line number Diff line number Diff line change
@@ -92,7 +92,7 @@ jobs:

- name: Run unit tests and report coverage
env:
LOG_LEVEL: DEBUG
# LOG_LEVEL: DEBUG
SQLALCHEMY_WARN_20: 1
run: |
poetry run coverage run --branch -m pytest --timeout 20 -n auto --non-integration --ignore=tests/e2e_tests/
@@ -108,8 +108,8 @@ jobs:
poetry run coverage run --branch -m pytest --integration --ignore=tests/e2e_tests/
- name: Convert coverage results to JSON (for CodeCov support)
env:
LOG_LEVEL: DEBUG
# env:
# LOG_LEVEL: DEBUG
run: poetry run coverage json --omit="keep/providers/*"

- name: Upload coverage reports to Codecov
73 changes: 65 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -62,6 +62,57 @@
> Missing a provider? [Submit a new provider request](https://github.com/keephq/keep/issues/new?assignees=&labels=provider&projects=&template=new_provider_request.md&title=) and we'll add it quickly!
### AI Backends for Enrichments, Correlations and Incident Context Gathering

<table>
<tr>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/anthropic-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/anthropic-icon.png" alt="Anthropic"/><br/>
Anthropic
</a>
</td>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/openai-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/openai-icon.png" alt="OpenAI"/><br/>
OpenAI
</a>
</td>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/deepseek-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/deepseek-icon.png" alt="DeepSeek"/><br/>
DeepSeek
</a>
</td>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/ollama-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/ollama-icon.png" alt="Ollama"/><br/>
Ollama
</a>
</td>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/llamacpp-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/llamacpp-icon.png" alt="LlamaCPP"/><br/>
LlamaCPP
</a>
</td>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/grok-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/grok-icon.png" alt="Grok"/><br/>
Grok
</a>
</td>
</tr>
<tr>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/gemini-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/gemini-icon.png" alt="Gemini"/><br/>
Gemini
</a>
</td>
</tr>
</table>

### Observability Tools

<table>
@@ -110,6 +161,12 @@
Cilium
</a>
</td>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/checkly-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/checkly-icon.png" alt="Checkly"/><br/>
Checkly
</a>
</td>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/cloudwatch-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/cloudwatch-icon.png" alt="CloudWatch"/><br/>
@@ -134,14 +191,14 @@
Dynatrace
</a>
</td>
</tr>
<tr>
<td align="center">
<a href="https://docs.keephq.dev/providers/documentation/elastic-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/elastic-icon.png" alt="Elastic"/><br/>
Elastic
</a>
</td>
</tr>
<tr>
<td align="center">
<a href="https://docs.keephq.dev/providers/documentation/gcpmonitoring-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/gcpmonitoring-icon.png" alt="GCP Monitoring"/><br/>
@@ -172,14 +229,14 @@
Netdata
</a>
</td>
</tr>
<tr>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/new-relic-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/newrelic-icon.png" alt="New Relic"/><br/>
New Relic
</a>
</td>
</tr>
<tr>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/parseable-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/parseable-icon.png" alt="Parseable"/><br/>
@@ -210,14 +267,14 @@
Sentry
</a>
</td>
</tr>
<tr>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/signalfx-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/signalfx-icon.png" alt="SignalFX"/><br/>
SignalFX
</a>
</td>
</tr>
<tr>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/openobserve-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/openobserve-icon.png" alt="OpenObserve"/><br/>
@@ -248,14 +305,14 @@
SumoLogic
</a>
</td>
</tr>
<tr>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/uptimekuma-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/uptimekuma-icon.png" alt="UptimeKuma"/><br/>
UptimeKuma
</a>
</td>
</tr>
<tr>
<td align="center" width="150">
<a href="https://docs.keephq.dev/providers/documentation/victoriametrics-provider" target="_blank">
<img width="40" src="keep-ui/public/icons/victoriametrics-icon.png" alt="VictoriaMetrics"/><br/>
67 changes: 67 additions & 0 deletions docs/alertevaluation/examples/victoriametricsmulti.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
---
title: "VictoriaMetrics Multi Alert Example"
---

This example demonstrates a simple CPU usage multi-alert based on a metric:

```yaml
# Manually triggered example: queries VictoriaMetrics for per-job CPU usage and
# creates a Keep alert, fingerprinted by the job label, when the value exceeds
# the threshold.
workflow:
  # Unique identifier for this workflow
  id: query-victoriametrics-multi
  # Display name shown in the UI
  name: victoriametrics-multi-alert-example
  # Brief description of what this workflow does
  description: victoriametrics
  triggers:
    # This workflow can be triggered manually from the UI
    - type: manual
  steps:
    # Query VictoriaMetrics for CPU metrics
    - name: victoriametrics-step
      provider:
        # Use the VictoriaMetrics provider configuration
        config: "{{ providers.vm }}"
        type: victoriametrics
        with:
          # Query that returns the sum of CPU usage for each job
          # Example response:
          # [
          #   {'metric': {'job': 'victoriametrics'}, 'value': [1737808021, '0.022633333333333307']},
          #   {'metric': {'job': 'vmagent'}, 'value': [1737808021, '0.009299999999999998']}
          # ]
          query: sum(rate(process_cpu_seconds_total)) by (job)
          queryType: query

  actions:
    # Create an alert in Keep based on the query results
    - name: create-alert
      provider:
        type: keep
        with:
          # Only create alert if CPU usage is above threshold.
          # value.1 is the second element of the 'value' pair shown in the
          # example response above.
          if: "{{ value.1 }} > 0.01 "
          # Alert must persist for 1 minute
          for: 1m
          # Use job label to create unique fingerprint for each alert
          fingerprint_fields:
            - labels.job
          alert:
            # Alert name includes the specific job
            name: "High CPU Usage on {{ metric.job }}"
            description: "CPU usage is high on the VM (created from VM metric)"
            # Set severity based on CPU usage thresholds:
            #   > 0.9 = critical
            #   > 0.7 = warning
            #   else  = info
            severity: '{{ value.1 }} > 0.9 ? "critical" : {{ value.1 }} > 0.7 ? "warning" : "info"'
            labels:
              # Job label is required for alert fingerprinting
              job: "{{ metric.job }}"
              # Additional context labels
              environment: production
              app: myapp
              service: api
              team: devops
              owner: alice

```
53 changes: 53 additions & 0 deletions docs/alertevaluation/examples/victoriametricssingle.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
---
title: "VictoriaMetrics Single Alert Example"
---

This example demonstrates a simple CPU usage alert based on a metric:

```yaml
# This workflow queries VictoriaMetrics metrics and creates alerts based on CPU usage
workflow:
  # Unique identifier for this workflow
  id: query-victoriametrics
  # Display name shown in the UI
  name: victoriametrics-alert-example
  # Brief description of what this workflow does
  description: Monitors CPU usage metrics from VictoriaMetrics and creates alerts when thresholds are exceeded

  # Define how the workflow is triggered
  triggers:
    - type: manual  # Can be triggered manually from the UI

  # Steps to execute in order
  steps:
    - name: victoriametrics-step
      provider:
        # Use VictoriaMetrics provider config defined in providers.vm
        config: "{{ providers.vm }}"
        type: victoriametrics
        with:
          # Query average CPU usage rate
          query: avg(rate(process_cpu_seconds_total))
          queryType: query

  # Actions to take based on the query results
  actions:
    - name: create-alert
      provider:
        type: keep
        with:
          # Create alert if CPU usage exceeds threshold
          if: "{{ value.1 }} > 0.0040"
          alert:
            name: "High CPU Usage"
            description: "[Single] CPU usage is high on the VM (created from VM metric)"
            # Set severity based on CPU usage thresholds:
            #   > 0.9 = critical
            #   > 0.7 = warning
            #   else  = info
            severity: '{{ value.1 }} > 0.9 ? "critical" : {{ value.1 }} > 0.7 ? "warning" : "info"'
            # Alert labels for filtering and routing
            labels:
              environment: production
              app: myapp
              service: api
              team: devops
              owner: alice
```
52 changes: 52 additions & 0 deletions docs/alertevaluation/overview.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
---
title: "Overview"
---

The Keep Alert Evaluation Engine is a flexible system that enables you to create alerts based on any data source and define evaluation rules. Unlike traditional monitoring solutions that are tied to specific metrics, Keep's engine allows you to combine data from multiple sources and apply complex logic to determine when and how alerts should be triggered.

## Core Features

### Generic Data Source Support
- Query any data source (databases, APIs, metrics systems)
- Combine multiple data sources in a single alert rule
- Apply custom transformations to the data

### Flexible Alert Evaluation
- Define custom conditions using templated expressions
- Support for complex boolean logic and mathematical operations
- State management for alert transitions (pending->firing->resolved)
- Deduplication and alert instance tracking

### Customizable Alert Definition
- Full control over alert metadata (name, description, severity)
- Dynamic labels based on evaluation context
- Template support for all alert fields
- Custom fingerprinting for alert grouping

## Core Components

### Alert States
- **Pending**: Initial state when alert condition is met (relevant only if `for` is supplied)
- **Firing**: Active alert that has met its duration condition
- **Resolved**: Alert that is no longer active

### Alert Rule Components
1. **Data Collection**: Query steps to gather data from any source
2. **Condition (`if`)**: Expression that determines when to create/update an alert
3. **Duration (`for`)**: Optional time period the condition must be true before firing
4. **Alert Definition**: Complete control over how the alert looks and behaves:
- Name and description
- Severity levels
- Labels for routing
- Custom fields and annotations

### State Management
- **Fingerprinting**: Unique identifier for alert deduplication and state tracking
- **Keep-Firing**: Control how long alerts remain active
- **State Transitions**: Rules for how alerts move between states

## Examples
The following examples demonstrate different ways to use the alert evaluation engine:

- [Single Metric Alert](/alertevaluation/examples/victoriametricssingle) - Basic example showing metrics-based alerting
- [Multiple Metrics Alert](/alertevaluation/examples/victoriametricsmulti) - Advanced example with multiple alert instances
Binary file added docs/images/checkly-provider_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/checkly-provider_10.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/checkly-provider_11.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/checkly-provider_12.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/checkly-provider_2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/checkly-provider_3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/checkly-provider_4.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/checkly-provider_5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/checkly-provider_6.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/checkly-provider_7.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/checkly-provider_8.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/checkly-provider_9.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 16 additions & 0 deletions docs/mint.json
Original file line number Diff line number Diff line change
@@ -103,6 +103,19 @@
}
]
},
{
"group": "Alert Evaluation Engine",
"pages": [
"alertevaluation/overview",
{
"group": "Examples",
"pages": [
"alertevaluation/examples/victoriametricssingle",
"alertevaluation/examples/victoriametricsmulti"
]
}
]
},
{
"group": "Providers",
"pages": [
@@ -124,14 +137,17 @@
"providers/documentation/bigquery-provider",
"providers/documentation/centreon-provider",
"providers/documentation/checkmk-provider",
"providers/documentation/checkly-provider",
"providers/documentation/cilium-provider",
"providers/documentation/clickhouse-provider",
"providers/documentation/cloudwatch-provider",
"providers/documentation/console-provider",
"providers/documentation/coralogix-provider",
"providers/documentation/datadog-provider",
"providers/documentation/deepseek-provider",
"providers/documentation/discord-provider",
"providers/documentation/dynatrace-provider",
"providers/documentation/eks-provider",
"providers/documentation/elastic-provider",
"providers/documentation/gcpmonitoring-provider",
"providers/documentation/gemini-provider",
6 changes: 3 additions & 3 deletions docs/overview/introduction.mdx
Original file line number Diff line number Diff line change
@@ -41,6 +41,6 @@ Our vision is to democratize AIOps, making it accessible and practical for teams

## What you should read next

- [Key Concepts](#key-concepts): Understand the foundational ideas behind Keep.
- [Use Cases](#use-cases): Learn how Keep can solve specific IT operations challenges.
- [Getting Started](#getting-started): Dive in and start using Keep today.
- [Key Concepts](/overview/glossary): Understand the foundational ideas behind Keep.
- [Use Cases](/overview/usecases): Learn how Keep can solve specific IT operations challenges.
- [Playground](/overview/playground): Explore Keep's playground.
Loading

0 comments on commit 94c2786

Please sign in to comment.