Skip to content

Commit

Permalink
feat: add topology based incidents
Browse files Browse the repository at this point in the history
  • Loading branch information
shahargl committed Dec 30, 2024
1 parent 3e3a0e6 commit 53d6ccd
Show file tree
Hide file tree
Showing 19 changed files with 769 additions and 95 deletions.
4 changes: 4 additions & 0 deletions keep-ui/app/(keep)/topology/api/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,7 @@ export async function getTopology(
const url = buildTopologyUrl({ providerIds, services, environment });
return await api.get<TopologyService[]>(url);
}

export async function pullTopology(api: ApiClient) {
return await api.post("/topology/pull");
}
39 changes: 37 additions & 2 deletions keep-ui/app/(keep)/topology/topology-client.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ import { ApplicationsList } from "./ui/applications/applications-list";
import { useContext, useEffect, useState } from "react";
import { TopologySearchContext } from "./TopologySearchContext";
import { TopologyApplication, TopologyService } from "./model";
import { Button } from "@/components/ui";
import { ArrowPathIcon } from "@heroicons/react/24/outline";
import { useApi } from "@/shared/lib/hooks/useApi";
import { pullTopology } from "./api";
import { toast } from "react-toastify";

export function TopologyPageClient({
applications,
Expand All @@ -16,6 +21,18 @@ export function TopologyPageClient({
}) {
const [tabIndex, setTabIndex] = useState(0);
const { selectedObjectId } = useContext(TopologySearchContext);
const api = useApi();

const handlePullTopology = async (e: React.MouseEvent) => {
e.stopPropagation();
try {
await pullTopology(api);
toast.success("Topology pull initiated");
} catch (error) {
toast.error("Failed to pull topology");
console.error("Failed to pull topology:", error);
}
};

useEffect(() => {
if (!selectedObjectId) {
Expand All @@ -32,8 +49,26 @@ export function TopologyPageClient({
onIndexChange={setTabIndex}
>
<TabList className="mb-2">
<Tab>Topology Map</Tab>
<Tab>Applications</Tab>
<Tab>
<div className="flex items-center gap-2 h-6">
<span className="inline-flex">
<Button
variant="secondary"
size="xs"
onClick={handlePullTopology}
title="Pull latest topology"
>
<ArrowPathIcon className="h-4 w-4" />
</Button>
</span>
<span>Topology Map</span>
</div>
</Tab>
<Tab>
<div className="flex items-center h-6">
<span>Applications</span>
</div>
</Tab>
</TabList>
<TabPanels className="flex-1 flex flex-col">
<TabPanel className="h-[calc(100vh-10rem)]">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@ export function getNodesAndEdgesFromTopologyData(
) {
const nodeMap = new Map<string, TopologyNode>();
const edgeMap = new Map<string, Edge>();
const allServices = topologyData.map((data) => data.display_name);

// Create nodes from service definitions
for (const service of topologyData) {
const numIncidentsToService = allIncidents.filter((incident) =>
incident.services.includes(service.display_name)
const numIncidentsToService = allIncidents.filter(
(incident) =>
incident.services.includes(service.display_name) ||
incident.services.includes(service.service)
);
const node: ServiceNodeType = {
id: service.service.toString(),
Expand Down
11 changes: 11 additions & 0 deletions keep/api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
IdentityManagerFactory,
IdentityManagerTypes,
)
from keep.topologies.topology_processor import TopologyProcessor

# load all providers into cache
from keep.workflowmanager.workflowmanager import WorkflowManager
Expand All @@ -76,6 +77,7 @@
PORT = config("PORT", default=8080, cast=int)
SCHEDULER = config("SCHEDULER", default="true", cast=bool)
CONSUMER = config("CONSUMER", default="true", cast=bool)
TOPOLOGY = config("KEEP_TOPOLOGY_PROCESSOR", default="false", cast=bool)
KEEP_DEBUG_TASKS = config("KEEP_DEBUG_TASKS", default="false", cast=bool)

AUTH_TYPE = config("AUTH_TYPE", default=IdentityManagerTypes.NOAUTH.value).lower()
Expand Down Expand Up @@ -142,6 +144,15 @@ async def startup():
logger.info("Consumer started successfully")
except Exception:
logger.exception("Failed to start the consumer")
# Start the topology processor
if TOPOLOGY:
try:
logger.info("Starting the topology processor")
topology_processor = TopologyProcessor.get_instance()
await topology_processor.start()
logger.info("Topology processor started successfully")
except Exception:
logger.exception("Failed to start the topology processor")

if KEEP_ARQ_TASK_POOL != KEEP_ARQ_TASK_POOL_NONE:
event_loop = asyncio.get_event_loop()
Expand Down
71 changes: 54 additions & 17 deletions keep/api/core/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -1816,13 +1816,43 @@ def create_incident_for_grouping_rule(
rule_id=rule.id,
rule_fingerprint=rule_fingerprint,
is_predicted=False,
is_confirmed=rule.create_on == CreateIncidentOn.ANY.value and not rule.require_approve,
is_confirmed=rule.create_on == CreateIncidentOn.ANY.value
and not rule.require_approve,
incident_type=IncidentType.RULE.value,
)
session.add(incident)
session.commit()
session.refresh(incident)
return incident


def create_incident_for_topology(
tenant_id: str, alert_group: list[Alert], session: Session
) -> Incident:
"""Create a new incident from topology-connected alerts"""
# Get highest severity from alerts
severity = max(alert.severity for alert in alert_group)

# Get all services
services = set()
service_names = set()
for alert in alert_group:
services.update(alert.service_ids)
service_names.update(alert.service_names)

incident = Incident(
tenant_id=tenant_id,
user_generated_name=f"Topology incident: Multiple alerts across {', '.join(service_names)}",
severity=severity.value,
status=IncidentStatus.FIRING.value,
is_confirmed=True,
incident_type=IncidentType.TOPOLOGY.value, # Set incident type for topology
data={"services": list(services), "alert_count": len(alert_group)},
)

return incident


def get_rule(tenant_id, rule_id):
with Session(engine) as session:
rule = session.exec(
Expand Down Expand Up @@ -1915,6 +1945,7 @@ def get_all_deduplication_rules(tenant_id):
).all()
return rules


def get_deduplication_rule_by_id(tenant_id, rule_id: str):
rule_uuid = __convert_to_uuid(rule_id)
if not rule_uuid:
Expand Down Expand Up @@ -1952,7 +1983,7 @@ def create_deduplication_rule(
full_deduplication: bool = False,
ignore_fields: list[str] = [],
priority: int = 0,
is_provisioned: bool = False
is_provisioned: bool = False,
):
with Session(engine) as session:
new_rule = AlertDeduplicationRule(
Expand Down Expand Up @@ -3403,15 +3434,20 @@ def create_incident_from_dto(
"assignee": incident_dto.assignee,
"is_predicted": False, # its not a prediction, but an AI generation
"is_confirmed": True, # confirmed by the user :)
"incident_type": IncidentType.AI.value,
}

elif issubclass(type(incident_dto), IncidentDto):
# we will reach this block when incident is pulled from a provider
incident_dict = incident_dto.to_db_incident().dict()

if "incident_type" not in incident_dict:
incident_dict["incident_type"] = IncidentType.MANUAL.value
else:
# We'll reach this block when a user creates an incident
incident_dict = incident_dto.dict()
# Keep existing incident_type if present, default to MANUAL if not
if "incident_type" not in incident_dict:
incident_dict["incident_type"] = IncidentType.MANUAL.value

return create_incident_from_dict(tenant_id, incident_dict)

Expand Down Expand Up @@ -3815,7 +3851,9 @@ def add_alerts_to_incident(
return incident


def get_incident_unique_fingerprint_count(tenant_id: str, incident_id: str | UUID) -> int:
def get_incident_unique_fingerprint_count(
tenant_id: str, incident_id: str | UUID
) -> int:
with Session(engine) as session:
return session.execute(
select(func.count(1))
Expand Down Expand Up @@ -4488,19 +4526,22 @@ def get_workflow_executions_for_incident_or_alert(
results = session.execute(final_query).all()
return results, total_count


def is_all_alerts_resolved(
fingerprints: Optional[List[str]] = None,
incident: Optional[Incident] = None,
session: Optional[Session] = None
session: Optional[Session] = None,
):
return is_all_alerts_in_status(fingerprints, incident, AlertStatus.RESOLVED, session)
return is_all_alerts_in_status(
fingerprints, incident, AlertStatus.RESOLVED, session
)


def is_all_alerts_in_status(
fingerprints: Optional[List[str]] = None,
incident: Optional[Incident] = None,
status: AlertStatus = AlertStatus.RESOLVED,
session: Optional[Session] = None
session: Optional[Session] = None,
):

if incident and incident.alerts_count == 0:
Expand Down Expand Up @@ -4533,19 +4574,15 @@ def is_all_alerts_in_status(
subquery = subquery.where(LastAlert.fingerprint.in_(fingerprints))

if incident:
subquery = (
subquery
.join(
subquery = subquery.join(
LastAlertToIncident,
and_(
LastAlertToIncident.tenant_id == LastAlert.tenant_id,
LastAlertToIncident.fingerprint == LastAlert.fingerprint,
),
)
.where(
LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT,
LastAlertToIncident.incident_id == incident.id,
)
).where(
LastAlertToIncident.deleted_at == NULL_FOR_DELETED_AT,
LastAlertToIncident.incident_id == incident.id,
)

subquery = subquery.subquery()
Expand Down Expand Up @@ -4920,8 +4957,8 @@ def set_last_alert(
timestamp=alert.timestamp,
first_timestamp=alert.timestamp,
alert_id=alert.id,
alert_hash=alert.alert_hash,
)
alert_hash=alert.alert_hash,
)

session.add(last_alert)
session.commit()
Expand Down
11 changes: 11 additions & 0 deletions keep/api/models/db/alert.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@
NULL_FOR_DELETED_AT = datetime(1000, 1, 1, 0, 0)


class IncidentType(str, enum.Enum):
MANUAL = "manual" # Created manually by users
AI = "ai" # Created by AI
RULE = "rule" # Created by rules engine
TOPOLOGY = "topology" # Created by topology processor


class AlertToIncident(SQLModel, table=True):
tenant_id: str = Field(foreign_key="tenant.id")
timestamp: datetime = Field(default_factory=datetime.utcnow)
Expand Down Expand Up @@ -157,6 +164,10 @@ class Incident(SQLModel, table=True):
# It's not a unique identifier in the DB (constraint), but when we have the same incident from some tools, we can use it to detect duplicates
fingerprint: str | None = Field(default=None, sa_column=Column(TEXT))

incident_type: str = Field(default=IncidentType.MANUAL.value)
# for topology incidents
incident_application: str | None = Field(default=None)

same_incident_in_the_past_id: UUID | None = Field(
sa_column=Column(
UUIDType(binary=False),
Expand Down
5 changes: 4 additions & 1 deletion keep/api/models/db/topology.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class TopologyService(SQLModel, table=True):
mac_address: Optional[str] = None
category: Optional[str] = None
manufacturer: Optional[str] = None
namespace: Optional[str] = None

updated_at: Optional[datetime] = Field(
sa_column=Column(
Expand All @@ -53,7 +54,7 @@ class TopologyService(SQLModel, table=True):
back_populates="service",
sa_relationship_kwargs={
"foreign_keys": "[TopologyServiceDependency.service_id]",
"cascade": "all, delete-orphan"
"cascade": "all, delete-orphan",
},
)

Expand Down Expand Up @@ -112,6 +113,7 @@ class TopologyServiceDtoBase(BaseModel, extra="ignore"):
mac_address: Optional[str] = None
category: Optional[str] = None
manufacturer: Optional[str] = None
namespace: Optional[str] = None


class TopologyServiceInDto(TopologyServiceDtoBase):
Expand Down Expand Up @@ -212,4 +214,5 @@ def from_orm(
],
application_ids=application_ids,
updated_at=service.updated_at,
namespace=service.namespace,
)
2 changes: 1 addition & 1 deletion keep/api/routes/preset.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def pull_data_from_providers(
try:
if isinstance(provider_class, BaseTopologyProvider):
logger.info("Pulling topology data", extra=extra)
topology_data = provider_class.pull_topology()
topology_data, _ = provider_class.pull_topology()
logger.info(
"Pulling topology data finished, processing",
extra={**extra, "topology_length": len(topology_data)},
Expand Down
Loading

0 comments on commit 53d6ccd

Please sign in to comment.