Add cuDF spilling statistics to RMM/GPU memory plot #8148

Merged · 26 commits · Dec 18, 2023

Changes from 25 commits

Commits (26)
7fc10ce
Initial exposure of cuDF logging information
charlesbluca Aug 30, 2023
04137ef
Initial plot of GPU to CPU nbytes
charlesbluca Aug 31, 2023
d38de06
Refactor RMM plot to include spilled memory
charlesbluca Sep 1, 2023
eeddf1e
Fix memory limit on x axis
charlesbluca Sep 1, 2023
043835c
Remove unused dashboard plot
charlesbluca Sep 1, 2023
0ac3344
Allow MemoryColor colors to be overridden
charlesbluca Sep 6, 2023
698be13
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 6, 2023
bb49135
Linting
charlesbluca Sep 6, 2023
628fe39
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 7, 2023
50e626a
Add cudf diagnostics test
charlesbluca Sep 7, 2023
98e283e
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 11, 2023
008cca8
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 12, 2023
082ddef
Resolve bokeh test failures
charlesbluca Sep 12, 2023
87f8020
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 19, 2023
4453b8b
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 26, 2023
b60173d
Make cudf spilling monitoring optional and disabled by default
charlesbluca Sep 28, 2023
7426548
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Sep 28, 2023
1890136
Modify cudf spilling test
charlesbluca Sep 28, 2023
5eafddc
Test cuDF spill tests in separate process
charlesbluca Sep 29, 2023
21e106b
Remove global cuDF spilling settings from build.sh
charlesbluca Oct 2, 2023
3cc4b94
cuDF metrics test is flaky
charlesbluca Oct 3, 2023
98dbfc7
Merge remote-tracking branch 'origin/main' into cudf-spilling-dashboard
charlesbluca Oct 10, 2023
a5fce3c
Merge remote-tracking branch 'upstream/main' into cudf-spilling-dashb…
charlesbluca Oct 25, 2023
b2fdfc6
Shouldn't need dask-cuda worker for test
charlesbluca Oct 25, 2023
f96b8a4
Merge remote-tracking branch 'upstream/main' into pr/charlesbluca/8148
charlesbluca Dec 11, 2023
936f0f6
Merge remote-tracking branch 'upstream/main' into cudf-spilling-dashb…
charlesbluca Dec 12, 2023
4 changes: 4 additions & 0 deletions continuous_integration/gpuci/build.sh
@@ -56,3 +56,7 @@ conda list --show-channel-urls

rapids-logger "Python py.test for distributed"
py.test distributed -v -m gpu --runslow --junitxml="$WORKSPACE/junit-distributed.xml"

# cuDF spill stats monitoring must be enabled for this test
CUDF_SPILL=on CUDF_SPILL_STATS=1 DASK_DISTRIBUTED__DIAGNOSTICS__CUDF=1 \
py.test distributed/diagnostics/tests/test_cudf_diagnostics.py -v -m gpu --runslow --junitxml="$WORKSPACE/junit-distributed.xml"
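
For reference, DASK_DISTRIBUTED__DIAGNOSTICS__CUDF=1 is the environment-variable form of a Dask config option (double underscores map to dots and the name is lower-cased). A minimal sketch of setting the same option programmatically, assuming the key added by this PR is distributed.diagnostics.cudf:

import dask

# Assumed equivalent of DASK_DISTRIBUTED__DIAGNOSTICS__CUDF=1
dask.config.set({"distributed.diagnostics.cudf": 1})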
262 changes: 134 additions & 128 deletions distributed/dashboard/components/rmm.py
@@ -1,7 +1,7 @@
from __future__ import annotations

import math
from textwrap import dedent
from collections.abc import Iterable
from typing import TypeVar

from bokeh.core.properties import without_property_validation
from bokeh.models import (
@@ -10,6 +10,7 @@
HoverTool,
NumeralTickFormatter,
OpenURL,
Range1d,
TapTool,
)
from bokeh.plotting import figure
@@ -18,12 +19,19 @@
from dask.utils import format_bytes

from distributed.dashboard.components import DashboardComponent, add_periodic_callback
from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024
from distributed.dashboard.components.scheduler import (
BOKEH_THEME,
TICKS_1024,
XLABEL_ORIENTATION,
MemoryColor,
)
from distributed.dashboard.utils import update
from distributed.utils import log_errors

T = TypeVar("T")


class RMMMemoryUsage(DashboardComponent):
class RMMMemoryUsage(DashboardComponent, MemoryColor):
"""
GPU memory usage plot that includes information about memory
managed by RMM. If an RMM pool is being used, shows the amount of
@@ -32,168 +40,166 @@

@log_errors
def __init__(self, scheduler, width=600, **kwargs):
DashboardComponent.__init__(self)
MemoryColor.__init__(self, neutral_color="#76B900")

self.last = 0
self.scheduler = scheduler
self.source = ColumnDataSource(
{
"rmm-used": [1, 2],
"rmm-used-half": [0.5, 1],
"rmm-total": [2, 4],
"rmm-total-half": [1, 2],
"external-used": [2, 1],
"external-used-x": [3, 4.5],
"worker": ["a", "b"],
"gpu-index": [0, 0],
"y": [1, 2],
"escaped_worker": ["a", "b"],
"rmm_memory_text": [
"RMM memory used: 1B/1B\nTotal GPU memory used: 1B/2B",
"RMM memory used: 1B/1B\nTotal GPU memory used: 1B/2B",
],
"width": [],
"x": [],
"y": [],
"color": [],
"alpha": [],
"worker": [],
"escaped_worker": [],
"rmm_used": [],
"rmm_total": [],
"gpu_used": [],
"gpu_total": [],
"spilled": [],
}
)

memory = figure(
title="RMM Memory",
self.root = figure(
title="RMM memory used",
tools="",
width=int(width / 2),
name="rmm_memory_histogram",
name="rmm_memory",
**kwargs,
)

rect = memory.rect(
source=self.source,
x="rmm-used-half",
y="y",
width="rmm-used",
height=1,
color="#76B900",
alpha=1.0,
)
rect.nonselection_glyph = None

rect = memory.rect(
rect = self.root.rect(
source=self.source,
x="rmm-total-half",
x="x",
y="y",
width="rmm-total",
height=1,
color="#76B900",
alpha=0.75,
width="width",
height=0.9,
color="color",
fill_alpha="alpha",
line_width=0,
)
rect.nonselection_glyph = None

rect = memory.rect(
source=self.source,
x="external-used-x",
y="y",
width="external-used",
height=1,
color="#76B900",
alpha=0.5,
self.root.axis[0].ticker = BasicTicker(**TICKS_1024)
self.root.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b")
self.root.xaxis.major_label_orientation = XLABEL_ORIENTATION
self.root.xaxis.minor_tick_line_alpha = 0
self.root.x_range = Range1d(start=0)
self.root.yaxis.visible = False
self.root.ygrid.visible = False
self.root.toolbar_location = None

tap = TapTool(callback=OpenURL(url="./info/worker/@escaped_worker.html"))
self.root.add_tools(tap)

hover = HoverTool(
point_policy="follow_mouse",
tooltips="""
<div>
<span style="font-size: 12px; font-weight: bold;">Worker:</span>&nbsp;
<span style="font-size: 10px; font-family: Monaco, monospace;">@worker</span>
</div>
<div>
<span style="font-size: 12px; font-weight: bold;">RMM memory used:</span>&nbsp;
<span style="font-size: 10px; font-family: Monaco, monospace;">@rmm_used{0.00 b} / @rmm_total{0.00 b}</span>
</div>
<div>
<span style="font-size: 12px; font-weight: bold;">GPU memory used:</span>&nbsp;
<span style="font-size: 10px; font-family: Monaco, monospace;">@gpu_used{0.00 b} / @gpu_total{0.00 b}</span>
</div>
<div>
<span style="font-size: 12px; font-weight: bold;">Spilled to CPU:</span>&nbsp;
<span style="font-size: 10px; font-family: Monaco, monospace;">@spilled{0.00 b}</span>
</div>
""",
)
rect.nonselection_glyph = None

memory.axis[0].ticker = BasicTicker(**TICKS_1024)
memory.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b")
memory.xaxis.major_label_orientation = -math.pi / 12
memory.x_range.start = 0

for fig in [memory]:
fig.xaxis.minor_tick_line_alpha = 0
fig.yaxis.visible = False
fig.ygrid.visible = False

tap = TapTool(callback=OpenURL(url="./info/worker/@escaped_worker.html"))
fig.add_tools(tap)

fig.toolbar_location = None
fig.yaxis.visible = False

hover = HoverTool()
hover.tooltips = "@worker : @rmm_memory_text"
hover.point_policy = "follow_mouse"
memory.add_tools(hover)

self.memory_figure = memory
self.root.add_tools(hover)

@without_property_validation
@log_errors
def update(self):
def quadlist(i: Iterable[T]) -> list[T]:
out = []
for ii in i:
out += [ii, ii, ii, ii]
return out

workers = list(self.scheduler.workers.values())
rmm_total = []

width = []
x = []
color = []
max_limit = 0
rmm_used = []
external_used = []
gpu_index = []
y = []
worker = []
external_used_x = []
memory_max = 0
rmm_total = []
gpu_used = []
gpu_total = []
rmm_memory_text = []
spilled = []

for idx, ws in enumerate(workers):
for ws in workers:
try:
rmm_metrics = ws.metrics["rmm"]
gpu_metrics = ws.metrics["gpu"]
gpu_info = ws.extra["gpu"]
except KeyError:
continue
rmm_total_worker = rmm_metrics["rmm-total"] # RMM memory only
rmm_used_worker = rmm_metrics["rmm-used"]
gpu_total_worker = gpu_info["memory-total"] # All GPU memory
gpu_used_worker = gpu_metrics["memory-used"]
rmm_metrics = {"rmm-used": 0, "rmm-total": 0}
gpu_metrics = {"memory-used": 0}
gpu_info = {"memory-total": 0}

try:
cudf_metrics = ws.metrics["cudf"]
except KeyError:
cudf_metrics = {"cudf-spilled": 0}

external_used_worker = gpu_used_worker - rmm_total_worker
rmm_used_worker = rmm_metrics["rmm-used"] # RMM memory only
rmm_total_worker = rmm_metrics["rmm-total"]
gpu_used_worker = gpu_metrics["memory-used"] # All GPU memory
gpu_total_worker = gpu_info["memory-total"]
spilled_worker = cudf_metrics["cudf-spilled"] or 0 # memory spilled to host

rmm_total.append(rmm_total_worker)
max_limit = max(
max_limit, gpu_total_worker, gpu_used_worker + spilled_worker
)
color_i = self._memory_color(gpu_used_worker, gpu_total_worker, ws.status)

width += [
rmm_used_worker,
rmm_total_worker - rmm_used_worker,
gpu_used_worker - rmm_total_worker,
spilled_worker,
]
x += [sum(width[-4:i]) + width[i] / 2 for i in range(-4, 0)]
color += [color_i, color_i, color_i, "grey"]

# memory info
rmm_used.append(rmm_used_worker)
rmm_total.append(rmm_total_worker)
gpu_used.append(gpu_used_worker)
gpu_total.append(gpu_total_worker)
external_used.append(external_used_worker)
external_used_x.append(rmm_total_worker + external_used_worker / 2)
worker.append(ws.address)
gpu_index.append(idx)
y.append(idx)

memory_max = max(memory_max, gpu_total_worker)

rmm_memory_text.append(
"RMM memory used: {}/{}\nTotal GPU memory used: {}/{}".format(
format_bytes(rmm_used_worker),
format_bytes(rmm_total_worker),
format_bytes(gpu_used_worker),
format_bytes(gpu_total_worker),
)
)
spilled.append(spilled_worker)

self.memory_figure.title.text = dedent(
"""\
RMM Utilization: {} / {}
GPU Memory: {} / {}
""".format(
format_bytes(sum(rmm_used)),
format_bytes(sum(rmm_total)),
format_bytes(sum([*rmm_total, *external_used])),
format_bytes(sum(gpu_total)),
)
)
title = f"RMM memory used: {format_bytes(sum(rmm_used))} / {format_bytes(sum(rmm_total))}\nGPU memory used: {format_bytes(sum(gpu_used))} / {format_bytes(sum(gpu_total))}"
if sum(spilled):
title += f" + {format_bytes(sum(spilled))} spilled to CPU"

[Codecov / codecov/patch warning on line 184 in distributed/dashboard/components/rmm.py: added line #L184 was not covered by tests]
self.root.title.text = title

result = {
"rmm-total": rmm_total,
"rmm-used": rmm_used,
"external-used": external_used,
"rmm-total-half": [m // 2 for m in rmm_total],
"rmm-used-half": [m // 2 for m in rmm_used],
"external-used-x": external_used_x,
"worker": worker,
"gpu-index": gpu_index,
"y": y,
"escaped_worker": [escape.url_escape(w) for w in worker],
"rmm_memory_text": rmm_memory_text,
"width": width,
"x": x,
"y": quadlist(range(len(workers))),
"color": color,
"alpha": [1, 0.7, 0.4, 1] * len(workers),
"worker": quadlist(ws.address for ws in workers),
"escaped_worker": quadlist(escape.url_escape(ws.address) for ws in workers),
"rmm_used": quadlist(rmm_used),
"rmm_total": quadlist(rmm_total),
"gpu_used": quadlist(gpu_used),
"gpu_total": quadlist(gpu_total),
"spilled": quadlist(spilled),
}

self.memory_figure.x_range.end = memory_max

self.root.x_range.end = max_limit
update(self.source, result)


@@ -202,5 +208,5 @@
rmm_load = RMMMemoryUsage(scheduler, sizing_mode="stretch_both")
rmm_load.update()
add_periodic_callback(doc, rmm_load, 100)
doc.add_root(rmm_load.memory_figure)
doc.add_root(rmm_load.root)
doc.theme = BOKEH_THEME
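
Side note on the update() layout above: each worker row is drawn as four stacked rects (RMM used, remainder of the RMM pool, non-RMM GPU memory, spilled bytes), and each rect's "x" is the centre of its segment along the stack, which is what the sum(width[-4:i]) + width[i] / 2 expression computes. A minimal standalone sketch with made-up worker numbers (not part of the diff):

def segment_centers(widths):
    # Centre of each segment in a horizontally stacked bar: running sum of
    # the preceding widths plus half of the segment's own width.
    centers, offset = [], 0
    for w in widths:
        centers.append(offset + w / 2)
        offset += w
    return centers

GiB = 2**30
# Hypothetical worker: 2 GiB RMM used, 3 GiB RMM pool, 4 GiB GPU memory used,
# 1 GiB spilled to host.
rmm_used, rmm_total, gpu_used, spilled = 2 * GiB, 3 * GiB, 4 * GiB, 1 * GiB
widths = [rmm_used, rmm_total - rmm_used, gpu_used - rmm_total, spilled]
print(segment_centers(widths))  # segment centres in bytes: 1, 2.5, 3.5, 4.5 GiB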
19 changes: 13 additions & 6 deletions distributed/dashboard/components/scheduler.py
@@ -276,25 +276,32 @@ class MemoryColor:
orange: float
red: float

def __init__(self):
def __init__(
self, neutral_color="blue", target_color="orange", terminated_color="red"
):
self.neutral_color = neutral_color
self.target_color = target_color
self.terminated_color = terminated_color

target = dask.config.get("distributed.worker.memory.target")
spill = dask.config.get("distributed.worker.memory.spill")
terminate = dask.config.get("distributed.worker.memory.terminate")
Comment on lines 286 to 288 (Member, Author):

Assuming these variables don't come into play when deciding GPU-to-CPU spilling, they can probably be ignored in that case, but I'm not sure what the best way to do this is. My first thought was to use a context manager to override the config when initializing MemoryColor, though I'm not sure whether that would interfere with AMM behavior down the line.
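
A minimal sketch of the context-manager idea, using dask.config.set as a temporary override around MemoryColor initialization; the chosen threshold values are hypothetical and this is not part of the PR:

import dask

from distributed.dashboard.components.scheduler import MemoryColor

# Hypothetical: disable the host-memory thresholds while MemoryColor reads
# them, so a GPU-oriented plot is not tinted by CPU spill/terminate fractions.
# dask.config.set(...) restores the previous config on exit; whether this
# interacts badly with AMM later is the open question raised above.
with dask.config.set(
    {
        "distributed.worker.memory.target": False,
        "distributed.worker.memory.spill": False,
        "distributed.worker.memory.terminate": False,
    }
):
    color = MemoryColor(neutral_color="#76B900")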


# These values can be False. It's also common to configure them to impossibly
# high values to achieve the same effect.
self.orange = min(target or math.inf, spill or math.inf)
self.red = min(terminate or math.inf, 1.0)

def _memory_color(self, current: int, limit: int, status: Status) -> str:
if status != Status.running:
return "red"
return self.terminated_color
if not limit:
return "blue"
return self.neutral_color
if current >= limit * self.red:
return "red"
return self.terminated_color
if current >= limit * self.orange:
return "orange"
return "blue"
return self.target_color
return self.neutral_color


class ClusterMemory(DashboardComponent, MemoryColor):