diff --git a/cloud/bq/dedup.go b/cloud/bq/dedup.go index 6ba7f84e..f36cc6d7 100644 --- a/cloud/bq/dedup.go +++ b/cloud/bq/dedup.go @@ -31,6 +31,8 @@ var ( ) // WaitForStableTable loops checking until table exists and has no streaming buffer. +// NOTE: We discovered that the StreamingBuffer == nil is not reliable, and must be rechecked for a minimum +// of 5 minutes to ensure that there is no more buffered data. (We saw reverts after as long as 4m50s). // TODO - refactor this to make it easier to understand. // TODO - move these functions to go/bqext package func WaitForStableTable(ctx context.Context, tt bqiface.Table) error { diff --git a/metrics/metrics.go b/metrics/metrics.go index 75b1f035..6a426c69 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -10,6 +10,7 @@ func init() { prometheus.MustRegister(StartedCount) prometheus.MustRegister(CompletedCount) prometheus.MustRegister(StateTimeSummary) + prometheus.MustRegister(StateTimeHistogram) prometheus.MustRegister(StateDate) prometheus.MustRegister(FilesPerDateHistogram) prometheus.MustRegister(BytesPerDateHistogram) @@ -102,6 +103,7 @@ var ( ) // StateTimeSummary measures the time spent in different task states. + // DEPRECATED - remove soon. // Provides metrics: // gardener_state_time_summary // Example usage: @@ -113,6 +115,23 @@ var ( }, []string{"state"}, ) + // StateTimeHistogram tracks the time spent in each state. Not necessary to label data type, as + // we currently have separate gardener deployments for each type. + // Usage example: + // metrics.StateTimeHistogram.WithLabelValues( + // StateName[state]).Observe(time.Since(start).Seconds()) + StateTimeHistogram = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "gardener_state_time_histogram", + Help: "time-in-state distributions.", + // These values range from seconds to hours. + Buckets: []float64{ + 0.1, 0.3, 1, 3, 10, 30, + 100, 300, 1000, 3000, 10000, 30000, // many hours + }, + }, + []string{"state"}) + // FilesPerDateHistogram provides a histogram of files per date submitted to pipeline. // // Provides metrics: diff --git a/state/state.go b/state/state.go index fc4111d1..f46b038a 100644 --- a/state/state.go +++ b/state/state.go @@ -219,7 +219,9 @@ func (t *Task) Save(ctx context.Context) error { // Update updates the task state, and saves to the "saver". func (t *Task) Update(ctx context.Context, st State) error { - duration := t.UpdateTime.Sub(time.Now()) + duration := time.Since(t.UpdateTime) + metrics.StateTimeHistogram.WithLabelValues(StateNames[t.State]).Observe(duration.Seconds()) + // TODO - remove this once we have some histogram history. metrics.StateTimeSummary.WithLabelValues(StateNames[t.State]).Observe(duration.Seconds()) t.State = st t.UpdateTime = time.Now()