Skip to content

Commit

Permalink
Refactor kubedl_job_status implementation to log status changes for e…
Browse files Browse the repository at this point in the history
…ach job and add metric: kubedl_job_finished_time
  • Loading branch information
13241308289 authored and paradox committed Mar 4, 2024
1 parent b93a2b4 commit 1cea838
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 6 deletions.
9 changes: 9 additions & 0 deletions pkg/job_controller/api/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,15 @@ const (
JobFailed JobConditionType = "Failed"
)

// JobConditionTypeValueMap assigns a numeric code to each JobConditionType so
// that a job condition can be reported as a float64 value: 0 for JobCreated
// through 5 for JobFailed. Condition types that are not listed here have no
// numeric encoding; callers should use the two-value map lookup to detect that.
var JobConditionTypeValueMap = map[JobConditionType]float64{
JobCreated: 0,
JobQueuing: 1,
JobRunning: 2,
JobRestarting: 3,
JobSucceeded: 4,
JobFailed: 5,
}

// SuccessPolicy is the policy to mark the job as succeeded, when the job does not contain the chief or master role.
type SuccessPolicy string

Expand Down
30 changes: 24 additions & 6 deletions pkg/metrics/job_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,14 @@ var (
Name: "kubedl_jobs_all_pods_launch_delay_seconds",
Help: "Histogram for recording sync launch delay duration(from job created to all pods running).",
}, []string{"kind", "name", "namespace", "uid"})
jobStatus = promauto.NewHistogramVec(prometheus.HistogramOpts{
jobStatus = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "kubedl_job_status",
Help: "Counts number of jobs with failed status",
}, []string{"kind", "name", "namespace", "uid", "status", "reason"})
}, []string{"kind", "name", "namespace", "uid"})
jobFinishedTime = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "kubedl_job_finished_time",
Help: "Job finished time",
}, []string{"kind", "name", "namespace", "uid"})
)

// JobMetrics holds the kinds of metrics counter for some type of job workload.
Expand All @@ -75,7 +79,8 @@ type JobMetrics struct {
restart prometheus.Counter
firstPodLaunchDelay *prometheus.HistogramVec
allPodsLaunchDelay *prometheus.HistogramVec
jobStatus *prometheus.HistogramVec
jobStatus *prometheus.GaugeVec
jobFinishedTime *prometheus.GaugeVec
}

func NewJobMetrics(kind string, client client.Client) *JobMetrics {
Expand All @@ -91,6 +96,7 @@ func NewJobMetrics(kind string, client client.Client) *JobMetrics {
firstPodLaunchDelay: firstPodLaunchDelayHist,
allPodsLaunchDelay: allPodsLaunchDelayHist,
jobStatus: jobStatus,
jobFinishedTime: jobFinishedTime,

Check warning on line 99 in pkg/metrics/job_metrics.go

View check run for this annotation

Codecov / codecov/patch

pkg/metrics/job_metrics.go#L99

Added line #L99 was not covered by tests
}
// Register running gauge func on center prometheus demand pull.
// Different kinds of workload metrics share the same metric name and help info,
Expand Down Expand Up @@ -146,17 +152,29 @@ func (m *JobMetrics) RestartInc() {
func (m *JobMetrics) JobStatusMetrics(job metav1.Object, status v1.JobStatus) {
for _, condition := range status.Conditions {
if condition.Status == corev1.ConditionTrue {
value, ok := v1.JobConditionTypeValueMap[condition.Type]
if !ok {
continue

Check warning on line 157 in pkg/metrics/job_metrics.go

View check run for this annotation

Codecov / codecov/patch

pkg/metrics/job_metrics.go#L155-L157

Added lines #L155 - L157 were not covered by tests
}
m.jobStatus.With(prometheus.Labels{
"kind": m.kind,
"name": job.GetName(),
"namespace": job.GetNamespace(),
"uid": string(job.GetUID()),
"status": string(condition.Type),
"reason": condition.Reason,
}).Observe(1)
}).Set(value)

Check warning on line 164 in pkg/metrics/job_metrics.go

View check run for this annotation

Codecov / codecov/patch

pkg/metrics/job_metrics.go#L164

Added line #L164 was not covered by tests

if condition.Type == v1.JobSucceeded || condition.Type == v1.JobFailed {
m.jobFinishedTime.With(prometheus.Labels{
"kind": m.kind,
"name": job.GetName(),
"namespace": job.GetNamespace(),
"uid": string(job.GetUID()),
}).Set(float64(condition.LastTransitionTime.Unix()))

Check warning on line 172 in pkg/metrics/job_metrics.go

View check run for this annotation

Codecov / codecov/patch

pkg/metrics/job_metrics.go#L166-L172

Added lines #L166 - L172 were not covered by tests
}
break
}
}

}

func (m *JobMetrics) FirstPodLaunchDelaySeconds(activePods []*corev1.Pod, job metav1.Object, status v1.JobStatus) {
Expand Down

0 comments on commit 1cea838

Please sign in to comment.