Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor kubedl_job_status implementation to log status changes for each job and add metric: kubedl_job_finished_time #313

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pkg/job_controller/api/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,15 @@ const (
JobFailed JobConditionType = "Failed"
)

// JobConditionTypeValueMap assigns a numeric value to each JobConditionType so
// that a job condition can be reported as a single float64 (for example as the
// value of a Prometheus gauge). The values follow the order in which the
// entries are listed here: Created(0), Queuing(1), Running(2), Restarting(3),
// Succeeded(4), Failed(5). Condition types that are absent from this map have
// no numeric representation; callers should check the "ok" result of the lookup.
var JobConditionTypeValueMap = map[JobConditionType]float64{
JobCreated: 0,
JobQueuing: 1,
JobRunning: 2,
JobRestarting: 3,
JobSucceeded: 4,
JobFailed: 5,
}

// SuccessPolicy is the policy to mark the job as succeeded, when the job does not contain the chief or master role.
type SuccessPolicy string

Expand Down
30 changes: 24 additions & 6 deletions pkg/metrics/job_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,14 @@
Name: "kubedl_jobs_all_pods_launch_delay_seconds",
Help: "Histogram for recording sync launch delay duration(from job created to all pods running).",
}, []string{"kind", "name", "namespace", "uid"})
jobStatus = promauto.NewHistogramVec(prometheus.HistogramOpts{
jobStatus = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "kubedl_job_status",
Help: "Counts number of jobs with failed status",
}, []string{"kind", "name", "namespace", "uid", "status", "reason"})
}, []string{"kind", "name", "namespace", "uid"})
jobFinishedTime = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "kubedl_job_finished_time",
Help: "Job finished time",
}, []string{"kind", "name", "namespace", "uid"})
)

// JobMetrics holds the kinds of metrics counter for some type of job workload.
Expand All @@ -75,7 +79,8 @@
restart prometheus.Counter
firstPodLaunchDelay *prometheus.HistogramVec
allPodsLaunchDelay *prometheus.HistogramVec
jobStatus *prometheus.HistogramVec
jobStatus *prometheus.GaugeVec
jobFinishedTime *prometheus.GaugeVec
}

func NewJobMetrics(kind string, client client.Client) *JobMetrics {
Expand All @@ -91,6 +96,7 @@
firstPodLaunchDelay: firstPodLaunchDelayHist,
allPodsLaunchDelay: allPodsLaunchDelayHist,
jobStatus: jobStatus,
jobFinishedTime: jobFinishedTime,

Check warning on line 99 in pkg/metrics/job_metrics.go

View check run for this annotation

Codecov / codecov/patch

pkg/metrics/job_metrics.go#L99

Added line #L99 was not covered by tests
}
// Register running gauge func on center prometheus demand pull.
// Different kinds of workload metrics share the same metric name and help info,
Expand Down Expand Up @@ -145,18 +151,30 @@

func (m *JobMetrics) JobStatusMetrics(job metav1.Object, status v1.JobStatus) {
for _, condition := range status.Conditions {
value, ok := v1.JobConditionTypeValueMap[condition.Type]
if !ok {
continue

Check warning on line 156 in pkg/metrics/job_metrics.go

View check run for this annotation

Codecov / codecov/patch

pkg/metrics/job_metrics.go#L154-L156

Added lines #L154 - L156 were not covered by tests
}
if condition.Status == corev1.ConditionTrue {
m.jobStatus.With(prometheus.Labels{
"kind": m.kind,
"name": job.GetName(),
"namespace": job.GetNamespace(),
"uid": string(job.GetUID()),
"status": string(condition.Type),
"reason": condition.Reason,
}).Observe(1)
}).Set(value)

Check warning on line 164 in pkg/metrics/job_metrics.go

View check run for this annotation

Codecov / codecov/patch

pkg/metrics/job_metrics.go#L164

Added line #L164 was not covered by tests

if condition.Type == v1.JobSucceeded || condition.Type == v1.JobFailed {
m.jobFinishedTime.With(prometheus.Labels{
"kind": m.kind,
"name": job.GetName(),
"namespace": job.GetNamespace(),
"uid": string(job.GetUID()),
}).Set(float64(condition.LastTransitionTime.Unix()))

Check warning on line 172 in pkg/metrics/job_metrics.go

View check run for this annotation

Codecov / codecov/patch

pkg/metrics/job_metrics.go#L166-L172

Added lines #L166 - L172 were not covered by tests
}
break
}
}

}

func (m *JobMetrics) FirstPodLaunchDelaySeconds(activePods []*corev1.Pod, job metav1.Object, status v1.JobStatus) {
Expand Down
Loading