Skip to content

Commit

Permalink
perf: handle metrics correctly
Browse files Browse the repository at this point in the history
Added the eventCacheRetriesTotal metric to capture pod, process, and parent info retries, along with updates to the parentInfoErrors, processInfoErrors, and podInfoErrors metrics.

Signed-off-by: sadath-12 <[email protected]>
  • Loading branch information
sadath-12 authored and lambdanis committed Jan 15, 2024
1 parent 004d9cf commit 95974f4
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 14 deletions.
11 changes: 6 additions & 5 deletions pkg/eventcache/eventcache.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import (

"github.com/cilium/tetragon/api/v1/tetragon"
"github.com/cilium/tetragon/pkg/ktime"
"github.com/cilium/tetragon/pkg/metrics/errormetrics"
"github.com/cilium/tetragon/pkg/metrics/eventcachemetrics"
"github.com/cilium/tetragon/pkg/option"
"github.com/cilium/tetragon/pkg/process"
Expand Down Expand Up @@ -70,7 +69,7 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u
if parent != nil {
ev.SetParent(parent.UnsafeGetProcess())
} else {
errormetrics.ErrorTotalInc(errormetrics.EventCacheParentInfoFailed)
eventcachemetrics.EventCacheRetries(eventcachemetrics.ParentInfo).Inc()
err = ErrFailedToGetParentInfo
}

Expand All @@ -86,7 +85,7 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u
process.UpdateEventProcessTid(proc, tid)
ev.SetProcess(proc)
} else {
errormetrics.ErrorTotalInc(errormetrics.EventCacheProcessInfoFailed)
eventcachemetrics.EventCacheRetries(eventcachemetrics.ProcessInfo).Inc()
err = ErrFailedToGetProcessInfo
}

Expand All @@ -103,7 +102,7 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u
func HandleGenericEvent(internal *process.ProcessInternal, ev notify.Event, tid *uint32) error {
p := internal.UnsafeGetProcess()
if option.Config.EnableK8s && p.Pod == nil {
errormetrics.ErrorTotalInc(errormetrics.EventCachePodInfoRetryFailed)
eventcachemetrics.EventCacheRetries(eventcachemetrics.PodInfo).Inc()
return ErrFailedToGetPodInfo
}

Expand Down Expand Up @@ -141,7 +140,9 @@ func (ec *Cache) handleEvents() {
tmp = append(tmp, event)
continue
}
if errors.Is(err, ErrFailedToGetProcessInfo) {
if errors.Is(err, ErrFailedToGetParentInfo) {
eventcachemetrics.ParentInfoError(notify.EventTypeString(event.event)).Inc()
} else if errors.Is(err, ErrFailedToGetProcessInfo) {
eventcachemetrics.ProcessInfoError(notify.EventTypeString(event.event)).Inc()
} else if errors.Is(err, ErrFailedToGetPodInfo) {
eventcachemetrics.PodInfoError(notify.EventTypeString(event.event)).Inc()
Expand Down
8 changes: 5 additions & 3 deletions pkg/grpc/exec/exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ func (msg *MsgExecveEventUnix) Retry(internal *process.ProcessInternal, ev notif
if option.Config.EnableK8s && containerId != "" {
podInfo = process.GetPodInfo(containerId, filename, args, nspid)
if podInfo == nil {
errormetrics.ErrorTotalInc(errormetrics.EventCachePodInfoRetryFailed)
eventcachemetrics.EventCacheRetries(eventcachemetrics.PodInfo).Inc()
return eventcache.ErrFailedToGetPodInfo
}
}
Expand Down Expand Up @@ -434,7 +434,7 @@ func (msg *MsgExitEventUnix) RetryInternal(ev notify.Event, timestamp uint64) (*
msg.RefCntDone[ParentRefCnt] = true
}
} else {
errormetrics.ErrorTotalInc(errormetrics.EventCacheParentInfoFailed)
eventcachemetrics.EventCacheRetries(eventcachemetrics.ParentInfo).Inc()
err = eventcache.ErrFailedToGetParentInfo
}

Expand All @@ -446,7 +446,7 @@ func (msg *MsgExitEventUnix) RetryInternal(ev notify.Event, timestamp uint64) (*
msg.RefCntDone[ProcessRefCnt] = true
}
} else {
errormetrics.ErrorTotalInc(errormetrics.EventCacheProcessInfoFailed)
eventcachemetrics.EventCacheRetries(eventcachemetrics.ProcessInfo).Inc()
err = eventcache.ErrFailedToGetProcessInfo
}

Expand Down Expand Up @@ -505,6 +505,7 @@ func (msg *MsgProcessCleanupEventUnix) RetryInternal(_ notify.Event, timestamp u
msg.RefCntDone[ParentRefCnt] = true
}
} else {
eventcachemetrics.EventCacheRetries(eventcachemetrics.ParentInfo).Inc()
err = eventcache.ErrFailedToGetParentInfo
}

Expand All @@ -514,6 +515,7 @@ func (msg *MsgProcessCleanupEventUnix) RetryInternal(_ notify.Event, timestamp u
msg.RefCntDone[ProcessRefCnt] = true
}
} else {
eventcachemetrics.EventCacheRetries(eventcachemetrics.ProcessInfo).Inc()
err = eventcache.ErrFailedToGetProcessInfo
}

Expand Down
6 changes: 0 additions & 6 deletions pkg/metrics/errormetrics/errormetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@ var (
ProcessCacheMissOnRemove ErrorType = "process_cache_miss_on_remove"
// Tid and Pid mismatch that could affect BPF and user space caching logic
ProcessPidTidMismatch ErrorType = "process_pid_tid_mismatch"
// Event cache podInfo retries failed.
EventCachePodInfoRetryFailed ErrorType = "event_cache_podinfo_retry_failed"
// Event cache failed to set process information for an event.
EventCacheProcessInfoFailed ErrorType = "event_cache_process_info_failed"
// Event cache failed to set parent information for an event.
EventCacheParentInfoFailed ErrorType = "event_cache_parent_info_failed"
// An event is missing process info.
EventMissingProcessInfo ErrorType = "event_missing_process_info"
// An error occurred in an event handler.
Expand Down
29 changes: 29 additions & 0 deletions pkg/metrics/eventcachemetrics/eventcachemetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ import (
"github.com/prometheus/client_golang/prometheus"
)

// Entry type values that callers pass as entryType to EventCacheRetries,
// where they become the label value of the event cache retries counter.
const (
// ProcessInfo marks a retry caused by missing process info for an event.
ProcessInfo = "process_info"
// ParentInfo marks a retry caused by missing parent info for an event.
ParentInfo = "parent_info"
// PodInfo marks a retry caused by missing pod info for an event.
PodInfo = "pod_info"
)

var (
processInfoErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Expand All @@ -33,13 +39,26 @@ var (
Help: "The total of errors encountered while fetching process exec information from the cache.",
ConstLabels: nil,
}, []string{"error"})
eventCacheRetriesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "event_cache_retries_total",
Help: "The total number of retries for event caching per entry type.",
}, []string{"entry_type"})
parentInfoErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "event_cache_parent_info_errors_total",
Help: "The total of times we failed to fetch cached parent info for a given event type.",
ConstLabels: nil,
}, []string{"event_type"})
)

// InitMetrics registers this package's event cache collectors with registry.
// Registration goes through MustRegister, so a failed registration panics.
func InitMetrics(registry *prometheus.Registry) {
	// MustRegister is variadic and registers the collectors in argument order.
	registry.MustRegister(
		processInfoErrors,
		podInfoErrors,
		EventCacheCount,
		eventCacheErrorsTotal,
		eventCacheRetriesTotal,
		parentInfoErrors,
	)
}

// Get a new handle on a processInfoErrors metric for an eventType
Expand All @@ -56,3 +75,13 @@ func PodInfoError(eventType string) prometheus.Counter {
func EventCacheError(err string) prometheus.Counter {
	// err is passed as the only label value of eventCacheErrorsTotal.
	counter := eventCacheErrorsTotal.WithLabelValues(err)
	return counter
}

// EventCacheRetries returns the eventCacheRetriesTotal counter whose
// "entry_type" label is set to entryType.
func EventCacheRetries(entryType string) prometheus.Counter {
	counter := eventCacheRetriesTotal.WithLabelValues(entryType)
	return counter
}

// ParentInfoError returns the parentInfoErrors counter whose "event_type"
// label is set to eventType.
func ParentInfoError(eventType string) prometheus.Counter {
	return parentInfoErrors.WithLabelValues(eventType)
}

0 comments on commit 95974f4

Please sign in to comment.