From a483eec49aa40ea87bee373a3895a23abc40c5a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E4=B9=90?= <2519960931@qq.com>
Date: Mon, 4 Mar 2024 17:37:46 +0800
Subject: [PATCH] Refactor kubedl_job_status implementation to record status
 changes for each job and add metric: kubedl_job_finished_time

Signed-off-by: paradox <2519960931@qq.com>
---
 pkg/job_controller/api/v1/types.go |  9 +++++++++
 pkg/metrics/job_metrics.go         | 30 ++++++++++++++++++++++++------
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/pkg/job_controller/api/v1/types.go b/pkg/job_controller/api/v1/types.go
index 2ac1acb7..59cbf197 100644
--- a/pkg/job_controller/api/v1/types.go
+++ b/pkg/job_controller/api/v1/types.go
@@ -168,6 +168,15 @@ const (
     JobFailed JobConditionType = "Failed"
 )
 
+var JobConditionTypeValueMap = map[JobConditionType]float64{
+    JobCreated:    0,
+    JobQueuing:    1,
+    JobRunning:    2,
+    JobRestarting: 3,
+    JobSucceeded:  4,
+    JobFailed:     5,
+}
+
 // SuccessPolicy is the policy to mark the job as succeeded, when the job does not contain the chief or master role.
 type SuccessPolicy string
 
diff --git a/pkg/metrics/job_metrics.go b/pkg/metrics/job_metrics.go
index 7beeeae4..7a068584 100644
--- a/pkg/metrics/job_metrics.go
+++ b/pkg/metrics/job_metrics.go
@@ -59,10 +59,14 @@ var (
         Name: "kubedl_jobs_all_pods_launch_delay_seconds",
         Help: "Histogram for recording sync launch delay duration(from job created to all pods running).",
     }, []string{"kind", "name", "namespace", "uid"})
-    jobStatus = promauto.NewHistogramVec(prometheus.HistogramOpts{
+    jobStatus = promauto.NewGaugeVec(prometheus.GaugeOpts{
         Name: "kubedl_job_status",
         Help: "Counts number of jobs with failed status",
-    }, []string{"kind", "name", "namespace", "uid", "status", "reason"})
+    }, []string{"kind", "name", "namespace", "uid"})
+    jobFinishedTime = promauto.NewGaugeVec(prometheus.GaugeOpts{
+        Name: "kubedl_job_finished_time",
+        Help: "Unix timestamp (in seconds) at which the job finished (succeeded or failed)",
+    }, []string{"kind", "name", "namespace", "uid"})
 )
 
 // JobMetrics holds the kinds of metrics counter for some type of job workload.
@@ -75,7 +79,8 @@ type JobMetrics struct {
     restart             prometheus.Counter
     firstPodLaunchDelay *prometheus.HistogramVec
     allPodsLaunchDelay  *prometheus.HistogramVec
-    jobStatus           *prometheus.HistogramVec
+    jobStatus           *prometheus.GaugeVec
+    jobFinishedTime     *prometheus.GaugeVec
 }
 
 func NewJobMetrics(kind string, client client.Client) *JobMetrics {
@@ -91,6 +96,7 @@ func NewJobMetrics(kind string, client client.Client) *JobMetrics {
         firstPodLaunchDelay: firstPodLaunchDelayHist,
         allPodsLaunchDelay:  allPodsLaunchDelayHist,
         jobStatus:           jobStatus,
+        jobFinishedTime:     jobFinishedTime,
     }
     // Register running gauge func on center prometheus demand pull.
     // Different kinds of workload metrics share the same metric name and help info,
@@ -146,17 +152,29 @@ func (m *JobMetrics) RestartInc() {
 func (m *JobMetrics) JobStatusMetrics(job metav1.Object, status v1.JobStatus) {
     for _, condition := range status.Conditions {
         if condition.Status == corev1.ConditionTrue {
+            value, ok := v1.JobConditionTypeValueMap[condition.Type]
+            if !ok {
+                continue
+            }
             m.jobStatus.With(prometheus.Labels{
                 "kind":      m.kind,
                 "name":      job.GetName(),
                 "namespace": job.GetNamespace(),
                 "uid":       string(job.GetUID()),
-                "status":    string(condition.Type),
-                "reason":    condition.Reason,
-            }).Observe(1)
+            }).Set(value)
+
+            if condition.Type == v1.JobSucceeded || condition.Type == v1.JobFailed {
+                m.jobFinishedTime.With(prometheus.Labels{
+                    "kind":      m.kind,
+                    "name":      job.GetName(),
+                    "namespace": job.GetNamespace(),
+                    "uid":       string(job.GetUID()),
+                }).Set(float64(condition.LastTransitionTime.Unix()))
+            }
             break
         }
     }
+
 }
 
 func (m *JobMetrics) FirstPodLaunchDelaySeconds(activePods []*corev1.Pod, job metav1.Object, status v1.JobStatus) {
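
Note for reviewers: below is a minimal, self-contained sketch of the gauge semantics introduced by this patch, written against prometheus/client_golang but outside of KubeDL. The metric names (demo_job_status, demo_job_finished_time), the conditionValue map, and the sample labels are illustrative stand-ins, not identifiers from this patch.

package main

import (
    "fmt"
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

// conditionValue mirrors the idea of JobConditionTypeValueMap above:
// each job phase maps to a numeric gauge value. The exact values here
// are illustrative.
var conditionValue = map[string]float64{
    "Created":    0,
    "Queuing":    1,
    "Running":    2,
    "Restarting": 3,
    "Succeeded":  4,
    "Failed":     5,
}

func main() {
    reg := prometheus.NewRegistry()

    // One series per job; the gauge value encodes the current phase.
    jobStatus := prometheus.NewGaugeVec(prometheus.GaugeOpts{
        Name: "demo_job_status",
        Help: "Current job status encoded as a numeric phase value.",
    }, []string{"kind", "name", "namespace", "uid"})

    // Unix timestamp at which a job reached a terminal phase.
    jobFinishedTime := prometheus.NewGaugeVec(prometheus.GaugeOpts{
        Name: "demo_job_finished_time",
        Help: "Unix timestamp at which the job finished.",
    }, []string{"kind", "name", "namespace", "uid"})

    reg.MustRegister(jobStatus, jobFinishedTime)

    labels := prometheus.Labels{
        "kind": "TFJob", "name": "mnist", "namespace": "default", "uid": "job-1234",
    }

    // Simulate a job that just failed: record the phase and the finish time,
    // the same way JobStatusMetrics does for a true Succeeded/Failed condition.
    phase := "Failed"
    jobStatus.With(labels).Set(conditionValue[phase])
    if phase == "Succeeded" || phase == "Failed" {
        jobFinishedTime.With(labels).Set(float64(time.Now().Unix()))
    }

    // Gather and print what a scrape of these two metrics would contain.
    mfs, err := reg.Gather()
    if err != nil {
        panic(err)
    }
    for _, mf := range mfs {
        for _, m := range mf.GetMetric() {
            fmt.Printf("%s = %v\n", mf.GetName(), m.GetGauge().GetValue())
        }
    }
}

Encoding the phase as a single gauge value keeps series cardinality bounded at one series per job, instead of one series per job/status/reason combination as with the previous histogram labels, at the cost of dashboards needing the value map to translate numbers back into condition names.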