Skip to content

Commit

Permalink
Merge pull request #645 from opensafely-core/telemetrics
Browse files Browse the repository at this point in the history
Add telemetry to the stats collection
  • Loading branch information
bloodearnest authored Aug 29, 2023
2 parents ebdcfde + cb66e26 commit 022419b
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 5 deletions.
32 changes: 28 additions & 4 deletions jobrunner/record_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,22 @@ def record_tick_trace(last_run):
if last_run is None:
return time.time_ns()

trace_attrs = {"stats_timeout": False, "stats_error": False}
stats = {}
error_attrs = {}

try:
stats = get_job_stats()
except subprocess.TimeoutExpired:
log.exception("Getting docker stats timed out")
# no metrics for this tick
stats = {}
trace_attrs["stats_timeout"] = True
except subprocess.CalledProcessError as exc:
log.exception("Error getting docker stats")
trace_attrs["stats_error"] = True

error_attrs["cmd"] = " ".join(exc.cmd)
error_attrs["exit_code"] = exc.returncode
error_attrs["output"] = exc.stderr + "\n\n" + exc.output

# record time once stats call has completed, as it can take a while
now = time.time_ns()
Expand All @@ -64,11 +74,25 @@ def record_tick_trace(last_run):
models.Job, state__in=[models.State.PENDING, models.State.RUNNING]
)

with tracer.start_as_current_span("TICK", start_time=start_time):
with tracer.start_as_current_span(
"TICK", start_time=start_time, attributes=trace_attrs
) as root:
# add error event so we can see the error from the docker command
if error_attrs:
root.add_event("stats_error", attributes=error_attrs, timestamp=start_time)

for job in active_jobs:
span = tracer.start_span(job.status_code.name, start_time=start_time)

# set up attributes
job_span_attrs = {}
job_span_attrs.update(trace_attrs)
metrics = stats.get(job.id, {})
tracing.set_span_metadata(span, job, **metrics)
job_span_attrs["has_metrics"] = metrics != {}
job_span_attrs.update(metrics)

# record span
tracing.set_span_metadata(span, job, **job_span_attrs)
span.end(end_time)

return end_time
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ dynamic_context = "test_function"
source = ["jobrunner"]

[tool.coverage.report]
fail_under = 78
fail_under = 80
show_missing = true
skip_covered = true

Expand Down
41 changes: 41 additions & 0 deletions tests/test_record_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,15 @@ def test_record_tick_trace(db, freezer, monkeypatch):
assert span.attributes["job"] == job.id
assert span.parent.span_id == root.context.span_id

assert span.attributes["stats_timeout"] is False
assert span.attributes["stats_error"] is False

if job is running_job:
assert span.attributes["has_metrics"] is True
assert span.attributes["cpu_percentage"] == 50.0
assert span.attributes["memory_used"] == 1000
else:
assert span.attributes["has_metrics"] is False

assert "SUCCEEDED" not in [s.name for s in spans]

Expand All @@ -76,3 +82,38 @@ def timeout():

assert "cpu_percentage" not in span.attributes
assert "memory_used" not in span.attributes
assert span.attributes["has_metrics"] is False
assert span.attributes["stats_timeout"] is True
assert span.attributes["stats_error"] is False


def test_record_tick_trace_stats_error(db, freezer, monkeypatch):
job_factory(status_code=StatusCode.EXECUTING)

def error():
raise subprocess.CalledProcessError(
returncode=1, cmd=["test", "cmd"], output="stdout", stderr="stderr"
)

monkeypatch.setattr(record_stats, "get_job_stats", error)

last_run = time.time()
record_stats.record_tick_trace(last_run)
assert len(get_trace("ticks")) == 2

spans = get_trace("ticks")
span = spans[0]

assert "cpu_percentage" not in span.attributes
assert "memory_used" not in span.attributes
assert span.attributes["has_metrics"] is False
assert span.attributes["stats_timeout"] is False
assert span.attributes["stats_error"] is True

root = spans[1]
assert root.attributes["stats_timeout"] is False
assert root.attributes["stats_error"] is True
assert root.events[0].attributes["exit_code"] == 1
assert root.events[0].attributes["cmd"] == "test cmd"
assert root.events[0].attributes["output"] == "stderr\n\nstdout"
assert root.events[0].name == "stats_error"

0 comments on commit 022419b

Please sign in to comment.