Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cuDF spilling statistics to RMM/GPU memory plot #8148

Merged
merged 26 commits into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7fc10ce
Initial exposure of cuDF logging information
charlesbluca Aug 30, 2023
04137ef
Initial plot of GPU to CPU nbytes
charlesbluca Aug 31, 2023
d38de06
Refactor RMM plot to include spilled memory
charlesbluca Sep 1, 2023
eeddf1e
Fix memory limit on x axis
charlesbluca Sep 1, 2023
043835c
Remove unused dashboard plot
charlesbluca Sep 1, 2023
0ac3344
Allow MemoryColor colors to be overridden
charlesbluca Sep 6, 2023
698be13
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 6, 2023
bb49135
Linting
charlesbluca Sep 6, 2023
628fe39
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 7, 2023
50e626a
Add cudf diagnostics test
charlesbluca Sep 7, 2023
98e283e
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 11, 2023
008cca8
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 12, 2023
082ddef
Resolve bokeh test failures
charlesbluca Sep 12, 2023
87f8020
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 19, 2023
4453b8b
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 26, 2023
b60173d
Make cudf spilling monitoring optional and disabled by default
charlesbluca Sep 28, 2023
7426548
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 28, 2023
1890136
Modify cudf spilling test
charlesbluca Sep 28, 2023
5eafddc
Test cuDF spill tests in separate process
charlesbluca Sep 29, 2023
21e106b
Remove global cuDF spilling settings from build.sh
charlesbluca Oct 2, 2023
3cc4b94
cuDF metrics test is flaky
charlesbluca Oct 3, 2023
98dbfc7
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Oct 10, 2023
a5fce3c
Merge remote-tracking branch 'upstream/main' into cudf-spilling-dashboard
charlesbluca Oct 25, 2023
b2fdfc6
Shouldn't need dask-cuda worker for test
charlesbluca Oct 25, 2023
f96b8a4
Merge remote-tracking branch 'upstream/main' into pr/charlesbluca/8148
charlesbluca Dec 11, 2023
936f0f6
Merge remote-tracking branch 'upstream/main' into cudf-spilling-dashboard
charlesbluca Dec 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
346 changes: 172 additions & 174 deletions distributed/dashboard/components/rmm.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import math
from textwrap import dedent
from collections.abc import Iterable
from typing import TypeVar

from bokeh.core.properties import without_property_validation
from bokeh.models import (
Expand All @@ -10,6 +10,7 @@
HoverTool,
NumeralTickFormatter,
OpenURL,
Range1d,
TapTool,
)
from bokeh.plotting import figure
Expand All @@ -18,191 +19,188 @@
from dask.utils import format_bytes

from distributed.dashboard.components import DashboardComponent, add_periodic_callback
from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024
from distributed.dashboard.components.scheduler import (
BOKEH_THEME,
TICKS_1024,
XLABEL_ORIENTATION,
MemoryColor,
)
from distributed.dashboard.utils import update
from distributed.utils import log_errors

T = TypeVar("T")


class RMMMemoryUsage(DashboardComponent, MemoryColor):
    """
    GPU memory usage plot that includes information about memory
    managed by RMM. If an RMM pool is being used, shows the amount of
    pool memory utilized.

    Every plotted worker is drawn as one horizontal bar made of four
    adjacent rectangles, from left to right:

    1. RMM memory in use,
    2. RMM memory reserved but not in use (``rmm-total - rmm-used``),
    3. GPU memory used outside of RMM (``memory-used - rmm-total``),
    4. memory spilled to the host (always drawn in grey).

    The data source therefore holds four rows per plotted worker.
    """

    # Fill alpha of the four rectangles of a worker bar, left to right.
    _SEGMENT_ALPHAS = (1, 0.7, 0.4, 1)

    @log_errors
    def __init__(self, scheduler, width=600, **kwargs):
        """Build the (initially empty) data source and the bokeh figure.

        Parameters
        ----------
        scheduler
            Object exposing a ``workers`` mapping; it is read on every
            ``update()`` call.
        width
            Nominal component width; the figure is created with half of it.
        **kwargs
            Forwarded unchanged to ``bokeh.plotting.figure``.
        """
        DashboardComponent.__init__(self)
        MemoryColor.__init__(self)

        self.last = 0
        self.scheduler = scheduler
        # One row per rectangle, i.e. four rows per plotted worker.
        self.source = ColumnDataSource(
            {
                "width": [],
                "x": [],
                "y": [],
                "color": [],
                "alpha": [],
                "worker": [],
                "escaped_worker": [],
                "rmm_used": [],
                "rmm_total": [],
                "gpu_used": [],
                "gpu_total": [],
                "spilled": [],
            }
        )

        self.root = figure(
            title="RMM memory used",
            tools="",
            width=int(width / 2),
            name="rmm_memory",
            **kwargs,
        )
        rect = self.root.rect(
            source=self.source,
            x="x",
            y="y",
            width="width",
            height=0.9,
            color="color",
            fill_alpha="alpha",
            line_width=0,
        )
        rect.nonselection_glyph = None

        self.root.axis[0].ticker = BasicTicker(**TICKS_1024)
        self.root.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b")
        self.root.xaxis.major_label_orientation = XLABEL_ORIENTATION
        self.root.xaxis.minor_tick_line_alpha = 0
        # Explicit range so that ``update()`` can set its ``end`` directly.
        self.root.x_range = Range1d(start=0)
        self.root.yaxis.visible = False
        self.root.ygrid.visible = False
        self.root.toolbar_location = None

        # Clicking a bar opens the info page of the corresponding worker.
        tap = TapTool(callback=OpenURL(url="./info/worker/@escaped_worker.html"))
        self.root.add_tools(tap)

        hover = HoverTool(
            point_policy="follow_mouse",
            tooltips="""
                <div>
                    <span style="font-size: 12px; font-weight: bold;">Worker:</span>&nbsp;
                    <span style="font-size: 10px; font-family: Monaco, monospace;">@worker</span>
                </div>
                <div>
                    <span style="font-size: 12px; font-weight: bold;">RMM memory used:</span>&nbsp;
                    <span style="font-size: 10px; font-family: Monaco, monospace;">@rmm_used{0.00 b} / @rmm_total{0.00 b}</span>
                </div>
                <div>
                    <span style="font-size: 12px; font-weight: bold;">GPU memory used:</span>&nbsp;
                    <span style="font-size: 10px; font-family: Monaco, monospace;">@gpu_used{0.00 b} / @gpu_total{0.00 b}</span>
                </div>
                <div>
                    <span style="font-size: 12px; font-weight: bold;">Spilled to CPU:</span>&nbsp;
                    <span style="font-size: 10px; font-family: Monaco, monospace;">@spilled{0.00 b}</span>
                </div>
                """,
        )
        self.root.add_tools(hover)

    @without_property_validation
    @log_errors
    def update(self):
        """Recompute all bars from the workers' current metrics.

        Workers that do not report ``"rmm"``/``"gpu"`` metrics or have no
        ``"gpu"`` entry in ``extra`` are left out of the plot. Every column
        pushed to the data source is derived from the workers that are
        actually plotted, so all columns always have the same length and
        each row carries the address of the worker it describes.
        """

        def quadlist(i: Iterable[T]) -> list[T]:
            # Repeat every item four times: one entry per rectangle of a bar.
            return [ii for ii in i for _ in range(4)]

        plotted = []  # worker states that contribute a bar, in plot order
        width = []
        x = []
        color = []
        max_limit = 0
        rmm_used = []
        rmm_total = []
        gpu_used = []
        gpu_total = []
        spilled = []

        for ws in list(self.scheduler.workers.values()):
            try:
                rmm_metrics = ws.metrics["rmm"]
                gpu_metrics = ws.metrics["gpu"]
                gpu_info = ws.extra["gpu"]
            except KeyError:
                # No RMM/GPU information for this worker: nothing to draw.
                continue

            try:
                cudf_metrics = ws.metrics["cudf"]
            except KeyError:
                # NOTE(review): a worker without cuDF metrics is still drawn,
                # with no spilled memory - confirm this is the desired
                # behaviour when cuDF spilling statistics are unavailable.
                spilled_worker = 0
            else:
                spilled_worker = cudf_metrics["cudf-spilled"]  # spilled to host

            rmm_used_worker = rmm_metrics["rmm-used"]  # RMM memory only
            rmm_total_worker = rmm_metrics["rmm-total"]
            gpu_used_worker = gpu_metrics["memory-used"]  # All GPU memory
            gpu_total_worker = gpu_info["memory-total"]

            # The x axis must fit the GPU capacity as well as bars that extend
            # beyond it because of spilled memory.
            max_limit = max(
                max_limit, gpu_total_worker, gpu_used_worker + spilled_worker
            )

            color_i = self._memory_color(gpu_used_worker, gpu_total_worker, ws.status)

            segments = [
                rmm_used_worker,
                rmm_total_worker - rmm_used_worker,
                gpu_used_worker - rmm_total_worker,
                spilled_worker,
            ]
            # ``rect`` glyphs are positioned by their centre, so each segment
            # sits at the end of the previous ones plus half its own width.
            left_edge = 0
            for segment in segments:
                width.append(segment)
                x.append(left_edge + segment / 2)
                left_edge += segment
            color += [color_i, color_i, color_i, "grey"]

            # memory info
            plotted.append(ws)
            rmm_used.append(rmm_used_worker)
            rmm_total.append(rmm_total_worker)
            gpu_used.append(gpu_used_worker)
            gpu_total.append(gpu_total_worker)
            spilled.append(spilled_worker)

        title = (
            f"RMM memory used: {format_bytes(sum(rmm_used))} / "
            f"{format_bytes(sum(rmm_total))}\n"
            f"GPU memory used: {format_bytes(sum(gpu_used))} / "
            f"{format_bytes(sum(gpu_total))}"
        )
        if sum(spilled):
            title += f" + {format_bytes(sum(spilled))} spilled to CPU"
        self.root.title.text = title

        result = {
            "width": width,
            "x": x,
            "y": quadlist(range(len(plotted))),
            "color": color,
            "alpha": list(self._SEGMENT_ALPHAS) * len(plotted),
            "worker": quadlist(ws.address for ws in plotted),
            "escaped_worker": quadlist(
                escape.url_escape(ws.address) for ws in plotted
            ),
            "rmm_used": quadlist(rmm_used),
            "rmm_total": quadlist(rmm_total),
            "gpu_used": quadlist(gpu_used),
            "gpu_total": quadlist(gpu_total),
            "spilled": quadlist(spilled),
        }

        self.root.x_range.end = max_limit
        update(self.source, result)


@log_errors
def rmm_memory_doc(scheduler, extra, doc):
    """Populate a bokeh document with the RMM/GPU memory plot.

    Creates an ``RMMMemoryUsage`` component for ``scheduler``, fills it once,
    registers it for periodic refresh through ``add_periodic_callback`` and
    adds its figure to ``doc``. ``extra`` is accepted but not used here.
    """
    rmm_load = RMMMemoryUsage(scheduler, sizing_mode="stretch_both")
    # Populate the plot immediately instead of waiting for the first callback.
    rmm_load.update()
    add_periodic_callback(doc, rmm_load, 100)
    # The component exposes its figure as ``root``.
    doc.add_root(rmm_load.root)
    doc.theme = BOKEH_THEME
Loading