From cb66e26fd9e292aa28d81ddf39e3915c479320ae Mon Sep 17 00:00:00 2001
From: bloodearnest <simon.davy@thedatalab.org>
Date: Tue, 29 Aug 2023 14:14:39 +0100
Subject: [PATCH] Add telemetry to the stats collection

We're sometimes seeing missing metrics for some jobs. This addes a lot
more data to the tick traces so we can try diagnose why we have missing
metrics.

Part of #634
---
 jobrunner/record_stats.py  | 32 +++++++++++++++++++++++++----
 pyproject.toml             |  2 +-
 tests/test_record_stats.py | 41 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/jobrunner/record_stats.py b/jobrunner/record_stats.py
index f6a02782..67fd96ba 100644
--- a/jobrunner/record_stats.py
+++ b/jobrunner/record_stats.py
@@ -46,12 +46,22 @@ def record_tick_trace(last_run):
     if last_run is None:
         return time.time_ns()
 
+    trace_attrs = {"stats_timeout": False, "stats_error": False}
+    stats = {}
+    error_attrs = {}
+
     try:
         stats = get_job_stats()
     except subprocess.TimeoutExpired:
         log.exception("Getting docker stats timed out")
-        # no metrics for this tick
-        stats = {}
+        trace_attrs["stats_timeout"] = True
+    except subprocess.CalledProcessError as exc:
+        log.exception("Error getting docker stats")
+        trace_attrs["stats_error"] = True
+
+        error_attrs["cmd"] = " ".join(exc.cmd)
+        error_attrs["exit_code"] = exc.returncode
+        error_attrs["output"] = exc.stderr + "\n\n" + exc.output
 
     # record time once stats call has completed, as it can take a while
     now = time.time_ns()
@@ -64,11 +74,25 @@ def record_tick_trace(last_run):
         models.Job, state__in=[models.State.PENDING, models.State.RUNNING]
     )
 
-    with tracer.start_as_current_span("TICK", start_time=start_time):
+    with tracer.start_as_current_span(
+        "TICK", start_time=start_time, attributes=trace_attrs
+    ) as root:
+        # add error event so we can see the error from the docker command
+        if error_attrs:
+            root.add_event("stats_error", attributes=error_attrs, timestamp=start_time)
+
         for job in active_jobs:
             span = tracer.start_span(job.status_code.name, start_time=start_time)
+
+            # set up attributes
+            job_span_attrs = {}
+            job_span_attrs.update(trace_attrs)
             metrics = stats.get(job.id, {})
-            tracing.set_span_metadata(span, job, **metrics)
+            job_span_attrs["has_metrics"] = metrics != {}
+            job_span_attrs.update(metrics)
+
+            # record span
+            tracing.set_span_metadata(span, job, **job_span_attrs)
             span.end(end_time)
 
     return end_time
diff --git a/pyproject.toml b/pyproject.toml
index c2e767c9..b29bd0b4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,7 +42,7 @@ dynamic_context = "test_function"
 source = ["jobrunner"]
 
 [tool.coverage.report]
-fail_under = 78
+fail_under = 80
 show_missing = true
 skip_covered = true
 
diff --git a/tests/test_record_stats.py b/tests/test_record_stats.py
index d7de3785..a6e389c8 100644
--- a/tests/test_record_stats.py
+++ b/tests/test_record_stats.py
@@ -50,9 +50,15 @@ def test_record_tick_trace(db, freezer, monkeypatch):
         assert span.attributes["job"] == job.id
         assert span.parent.span_id == root.context.span_id
 
+        assert span.attributes["stats_timeout"] is False
+        assert span.attributes["stats_error"] is False
+
         if job is running_job:
+            assert span.attributes["has_metrics"] is True
             assert span.attributes["cpu_percentage"] == 50.0
             assert span.attributes["memory_used"] == 1000
+        else:
+            assert span.attributes["has_metrics"] is False
 
     assert "SUCCEEDED" not in [s.name for s in spans]
 
@@ -76,3 +82,38 @@ def timeout():
 
     assert "cpu_percentage" not in span.attributes
     assert "memory_used" not in span.attributes
+    assert span.attributes["has_metrics"] is False
+    assert span.attributes["stats_timeout"] is True
+    assert span.attributes["stats_error"] is False
+
+
+def test_record_tick_trace_stats_error(db, freezer, monkeypatch):
+    job_factory(status_code=StatusCode.EXECUTING)
+
+    def error():
+        raise subprocess.CalledProcessError(
+            returncode=1, cmd=["test", "cmd"], output="stdout", stderr="stderr"
+        )
+
+    monkeypatch.setattr(record_stats, "get_job_stats", error)
+
+    last_run = time.time()
+    record_stats.record_tick_trace(last_run)
+    assert len(get_trace("ticks")) == 2
+
+    spans = get_trace("ticks")
+    span = spans[0]
+
+    assert "cpu_percentage" not in span.attributes
+    assert "memory_used" not in span.attributes
+    assert span.attributes["has_metrics"] is False
+    assert span.attributes["stats_timeout"] is False
+    assert span.attributes["stats_error"] is True
+
+    root = spans[1]
+    assert root.attributes["stats_timeout"] is False
+    assert root.attributes["stats_error"] is True
+    assert root.events[0].attributes["exit_code"] == 1
+    assert root.events[0].attributes["cmd"] == "test cmd"
+    assert root.events[0].attributes["output"] == "stderr\n\nstdout"
+    assert root.events[0].name == "stats_error"