From cb66e26fd9e292aa28d81ddf39e3915c479320ae Mon Sep 17 00:00:00 2001 From: bloodearnest Date: Tue, 29 Aug 2023 14:14:39 +0100 Subject: [PATCH] Add telemetry to the stats collection We're sometimes seeing missing metrics for some jobs. This addes a lot more data to the tick traces so we can try diagnose why we have missing metrics. Part of #634 --- jobrunner/record_stats.py | 32 +++++++++++++++++++++++++---- pyproject.toml | 2 +- tests/test_record_stats.py | 41 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 5 deletions(-) diff --git a/jobrunner/record_stats.py b/jobrunner/record_stats.py index f6a02782..67fd96ba 100644 --- a/jobrunner/record_stats.py +++ b/jobrunner/record_stats.py @@ -46,12 +46,22 @@ def record_tick_trace(last_run): if last_run is None: return time.time_ns() + trace_attrs = {"stats_timeout": False, "stats_error": False} + stats = {} + error_attrs = {} + try: stats = get_job_stats() except subprocess.TimeoutExpired: log.exception("Getting docker stats timed out") - # no metrics for this tick - stats = {} + trace_attrs["stats_timeout"] = True + except subprocess.CalledProcessError as exc: + log.exception("Error getting docker stats") + trace_attrs["stats_error"] = True + + error_attrs["cmd"] = " ".join(exc.cmd) + error_attrs["exit_code"] = exc.returncode + error_attrs["output"] = exc.stderr + "\n\n" + exc.output # record time once stats call has completed, as it can take a while now = time.time_ns() @@ -64,11 +74,25 @@ def record_tick_trace(last_run): models.Job, state__in=[models.State.PENDING, models.State.RUNNING] ) - with tracer.start_as_current_span("TICK", start_time=start_time): + with tracer.start_as_current_span( + "TICK", start_time=start_time, attributes=trace_attrs + ) as root: + # add error event so we can see the error from the docker command + if error_attrs: + root.add_event("stats_error", attributes=error_attrs, timestamp=start_time) + for job in active_jobs: span = tracer.start_span(job.status_code.name, start_time=start_time) + + # set up attributes + job_span_attrs = {} + job_span_attrs.update(trace_attrs) metrics = stats.get(job.id, {}) - tracing.set_span_metadata(span, job, **metrics) + job_span_attrs["has_metrics"] = metrics != {} + job_span_attrs.update(metrics) + + # record span + tracing.set_span_metadata(span, job, **job_span_attrs) span.end(end_time) return end_time diff --git a/pyproject.toml b/pyproject.toml index c2e767c9..b29bd0b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ dynamic_context = "test_function" source = ["jobrunner"] [tool.coverage.report] -fail_under = 78 +fail_under = 80 show_missing = true skip_covered = true diff --git a/tests/test_record_stats.py b/tests/test_record_stats.py index d7de3785..a6e389c8 100644 --- a/tests/test_record_stats.py +++ b/tests/test_record_stats.py @@ -50,9 +50,15 @@ def test_record_tick_trace(db, freezer, monkeypatch): assert span.attributes["job"] == job.id assert span.parent.span_id == root.context.span_id + assert span.attributes["stats_timeout"] is False + assert span.attributes["stats_error"] is False + if job is running_job: + assert span.attributes["has_metrics"] is True assert span.attributes["cpu_percentage"] == 50.0 assert span.attributes["memory_used"] == 1000 + else: + assert span.attributes["has_metrics"] is False assert "SUCCEEDED" not in [s.name for s in spans] @@ -76,3 +82,38 @@ def timeout(): assert "cpu_percentage" not in span.attributes assert "memory_used" not in span.attributes + assert span.attributes["has_metrics"] is False + assert span.attributes["stats_timeout"] is True + assert span.attributes["stats_error"] is False + + +def test_record_tick_trace_stats_error(db, freezer, monkeypatch): + job_factory(status_code=StatusCode.EXECUTING) + + def error(): + raise subprocess.CalledProcessError( + returncode=1, cmd=["test", "cmd"], output="stdout", stderr="stderr" + ) + + monkeypatch.setattr(record_stats, "get_job_stats", error) + + last_run = time.time() + record_stats.record_tick_trace(last_run) + assert len(get_trace("ticks")) == 2 + + spans = get_trace("ticks") + span = spans[0] + + assert "cpu_percentage" not in span.attributes + assert "memory_used" not in span.attributes + assert span.attributes["has_metrics"] is False + assert span.attributes["stats_timeout"] is False + assert span.attributes["stats_error"] is True + + root = spans[1] + assert root.attributes["stats_timeout"] is False + assert root.attributes["stats_error"] is True + assert root.events[0].attributes["exit_code"] == 1 + assert root.events[0].attributes["cmd"] == "test cmd" + assert root.events[0].attributes["output"] == "stderr\n\nstdout" + assert root.events[0].name == "stats_error"