Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tools 2679 show stop writes bug #210

Merged
merged 13 commits into from
Oct 6, 2023
4 changes: 2 additions & 2 deletions lib/health/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,13 +407,13 @@
stop_writes = select "stop_writes" from NAMESPACE.STATISTICS;
stop_writes = group by CLUSTER, NAMESPACE stop_writes;
ASSERT(stop_writes, False, "Namespace has hit stop-writes (stop_writes = true)", "OPERATIONS" , CRITICAL,
"Listed namespace(s) have hit stop-write. Please run 'show statistics namespace like stop_writes' for details.",
"Listed namespace(s) have hit stop-write. Please run 'show stop-writes' for details.",
"Namespace stop-writes flag check.");

clock_skew_stop_writes = select "clock_skew_stop_writes" from NAMESPACE.STATISTICS;
clock_skew_stop_writes = group by CLUSTER, NAMESPACE clock_skew_stop_writes;
ASSERT(clock_skew_stop_writes, False, "Namespace has hit clock-skew-stop-writes (clock_skew_stop_writes = true)", "OPERATIONS" , CRITICAL,
"Listed namespace(s) have hit clock-skew-stop-writes. Please run 'show statistics namespace like clock_skew_stop_writes' for details.",
"Listed namespace(s) have hit clock-skew-stop-writes. Please run 'show stop-writes' for details.",
"Namespace clock-skew-stop-writes flag check.");

SET CONSTRAINT VERSION < 4.3;
Expand Down
169 changes: 113 additions & 56 deletions lib/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1649,39 +1649,81 @@ def _create_stop_writes_entry(

@staticmethod
def _is_stop_writes_cause(
    usage: int | float,
    threshold: int | float,
    stop_writes: str | None = None,
    invert: bool = False,
) -> bool:
    """Report whether a metric has crossed its stop-writes threshold.

    Arguments:
    usage -- current value of the metric.
    threshold -- limit the metric is compared against. A threshold of 0 is
                 never reported as a cause (presumably 0 means the limit is
                 disabled -- confirm against the server configuration docs).
    stop_writes -- value of the related stop-writes flag, if known. When it is
                   None the flag is not consulted; otherwise it must equal
                   "true" (case-insensitive) for the metric to be a cause.
    invert -- False by default: the metric is a cause when usage >= threshold.
              If True the comparison flips to usage <= threshold, for metrics
              where a lower value is the problematic direction.

    Returns True only when the threshold is crossed and the flag check passes.
    """
    if threshold == 0:
        return False

    # The comparison runs before the flag check, exactly as in the original
    # conditional expression, so type errors surface in the same situations.
    crossed = usage <= threshold if invert else usage >= threshold
    flag_allows = stop_writes is None or stop_writes.lower() == "true"

    return bool(crossed and flag_allows)


@staticmethod
jdogmcsteezy marked this conversation as resolved.
Show resolved Hide resolved
def _get_first_value_from_dict_with_key(
    dict_: dict[str, Any],
    key: str | tuple,
    default_value: Any = None,
    return_type: type = str,
) -> tuple[str | None, Any]:
    """Look up the first candidate key that is present in dict_.

    Arguments:
    dict_ -- mapping to search.
    key -- a single key, or a tuple of candidate keys tried in order.
    default_value -- forwarded to util.get_value_from_dict for the key that
                     matched; it is NOT used when no candidate is present.
    return_type -- forwarded to util.get_value_from_dict.

    Returns a (matched_key, value) pair, where value is whatever
    util.get_value_from_dict yields for the matched key. Returns (None, None)
    when none of the candidate keys is in dict_.
    """
    # A distinct loop variable keeps the `key` parameter intact; the original
    # `for key in key` rebound the parameter while iterating over it.
    candidates = (key,) if isinstance(key, str) else key

    for candidate in candidates:
        if candidate in dict_:
            return candidate, util.get_value_from_dict(
                dict_, candidate, default_value=default_value, return_type=return_type
            )

    return None, None


@staticmethod
def _format_ns_stop_writes_metrics(
stop_writes_metrics: StopWritesDict,
service_stats,
ns_stats,
):
for node in service_stats:
cluster_clock_skew_ms = service_stats[node].get("cluster_clock_skew_ms", None)
cluster_clock_skew_stop_writes_sec = service_stats[node].get(
"cluster_clock_skew_stop_writes_sec", None
cluster_clock_skew_ms: int | None = util.get_value_from_dict(
service_stats[node],
"cluster_clock_skew_ms",
None,
return_type=int,
)
cluster_clock_skew_stop_writes_sec: int | None = util.get_value_from_dict(
service_stats[node],
"cluster_clock_skew_stop_writes_sec",
None,
return_type=int,
)
system_free_mem_pct: int | None = util.get_value_from_dict(
service_stats[node],
"system_free_mem_pct",
None,
return_type=int,
)
system_free_mem_pct = service_stats[node].get("system_free_mem_pct", None)

for ns, stats in ns_stats.get(node, {}).items():
# There is no config for this trigger
strong_consistency: str | None = stats.get("strong-consistency", None)
nsup_period: str | None = stats.get("nsup-period", None)
stop_writes: str | None = stats.get("clock_skew_stop_writes", None)
metric: str = "cluster_clock_skew_ms"
usage = cluster_clock_skew_ms
threshold = cluster_clock_skew_stop_writes_sec
metric = "cluster_clock_skew_ms"
usage: int | float | None = cluster_clock_skew_ms
threshold: int | float | None = cluster_clock_skew_stop_writes_sec

"""
For Available mode (AP) namespaces running versions 4.5.1 or above and where
Expand All @@ -1694,34 +1736,32 @@ def _format_ns_stop_writes_metrics(
and nsup_period is not None # nsup-period was added in 4.5.1.
and nsup_period != "0"
):
thresh = 40000
threshold = 40000
else:
thresh = (
int(cluster_clock_skew_stop_writes_sec) * 1000
) # convert to ms
use = int(usage)
sw = _is_stop_writes_cause(use, thresh, stop_writes)
threshold = int(threshold) * 1000 # convert to ms

sw = _is_stop_writes_cause(usage, threshold, stop_writes)
_create_stop_writes_entry(
stop_writes_metrics[node],
metric,
use,
usage,
sw,
thresh,
threshold,
namespace=ns,
)

stop_writes: str | None = stats.get("stop_writes", None)
metric = "system_free_mem_pct"
config = "stop-writes-sys-memory-pct"
threshold: str | None = stats.get(config, None)
threshold = util.get_value_from_dict(stats, config, None, return_type=int)

if (
threshold is not None
and system_free_mem_pct is not None
and stop_writes is not None
):
thresh = int(threshold)
use = 100 - int(system_free_mem_pct)
thresh = threshold
use = 100 - system_free_mem_pct
sw = _is_stop_writes_cause(use, thresh, stop_writes)
_create_stop_writes_entry(
stop_writes_metrics[node],
Expand All @@ -1740,70 +1780,87 @@ def _format_ns_stop_writes_metrics(
if stop_writes is None:
continue

metric = "device_avail_pct"
config = "min-avail-pct"
usage: str | None = stats.get(metric, None)
threshold: str | None = stats.get(config, None)

if usage is None:
metric = "pmem_avail_pct"
usage = stats.get(metric, None)
metric, usage = _get_first_value_from_dict_with_key(
stats,
("data_avail_pct", "device_available_pct", "pmem_available_pct"),
default_value=None,
return_type=int,
)
config, threshold = _get_first_value_from_dict_with_key(
stats,
(
"storage-engine.stop-writes-avail-pct",
"storage-engine.min-avail-pct",
),
default_value=None,
return_type=int,
)

if usage is not None and threshold is not None:
use = int(usage)
thresh = int(threshold)
sw = _is_stop_writes_cause(use, thresh, stop_writes)
if metric and usage is not None and threshold is not None:
jdogmcsteezy marked this conversation as resolved.
Show resolved Hide resolved
sw = _is_stop_writes_cause(usage, threshold, stop_writes, invert=True)
_create_stop_writes_entry(
stop_writes_metrics[node],
metric,
use,
usage,
sw,
thresh,
threshold,
config=config,
namespace=ns,
)

metric = "device_used_bytes"
config = "max-used-pct"
usage: str | None = stats.get(metric, None)
bytes_total: str | None = stats.get("device_total_bytes", None)
threshold: str | None = stats.get(config, None)

if usage is None:
metric = "pmem_used_bytes"
usage = stats.get(metric, None)
bytes_total = stats.get("pmem_total_bytes", None)
metric, usage = _get_first_value_from_dict_with_key(
stats,
("data_used_bytes", "device_used_bytes", "pmem_used_bytes"),
default_value=None,
return_type=int,
)
config, threshold = _get_first_value_from_dict_with_key(
stats,
("storage-engine.stop-writes-used-pct", "storage-engine.max-used-pct"),
default_value=None,
return_type=int,
)
bytes_total: int | float | None = util.get_value_from_dict(
stats,
("data_total_bytes", "device_total_bytes", "pmem_total_bytes"),
None,
return_type=int,
)

if usage is not None and threshold is not None and bytes_total is not None:
jdogmcsteezy marked this conversation as resolved.
Show resolved Hide resolved
use = int(usage)
thresh = int(bytes_total) * (int(threshold) / 100)
sw = _is_stop_writes_cause(use, thresh, stop_writes)
threshold = bytes_total * (threshold / 100)
sw = _is_stop_writes_cause(usage, threshold, stop_writes)
_create_stop_writes_entry(
stop_writes_metrics[node],
metric,
use,
usage,
sw,
thresh,
threshold,
config=config,
namespace=ns,
)

metric = "memory_used_bytes"
config = "stop-writes-pct"
usage: str | None = stats.get(metric, None)
bytes_total: str | None = stats.get("memory-size", None)
threshold: str | None = stats.get(config, None)
usage = util.get_value_from_dict(
stats, metric, default_value=None, return_type=int
)
bytes_total = util.get_value_from_dict(
stats, "memory-size", default_value=None, return_type=int
)
threshold = util.get_value_from_dict(
stats, config, default_value=None, return_type=int
)

if usage is not None and threshold is not None and bytes_total is not None:
use = int(usage)
thresh = int(bytes_total) * (int(threshold) / 100)
sw = _is_stop_writes_cause(use, thresh, stop_writes)
threshold = int(bytes_total) * (int(threshold) / 100)
jdogmcsteezy marked this conversation as resolved.
Show resolved Hide resolved
sw = _is_stop_writes_cause(usage, threshold, stop_writes)
_create_stop_writes_entry(
stop_writes_metrics[node],
metric,
use,
usage,
sw,
thresh,
threshold,
config=config,
namespace=ns,
)
Expand Down
13 changes: 8 additions & 5 deletions lib/view/sheet/decleration.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,16 +147,19 @@ def fun(edata: EntryValue):
return fun

@staticmethod
def _fmt_pct_type(val: float, invert: bool = False):
    """Format a percentage value as a string such as "12.35 %".

    Arguments:
    val -- the percentage; anything float() accepts.
    invert -- False by default, if True formats 100 - val instead.

    The value is rounded to two decimal places and suffixed with " %".
    """
    # Convert before inverting so an input that float() accepts (for example a
    # numeric string) behaves the same with and without `invert`.
    pct = float(val)

    if invert:
        pct = 100 - pct

    return str(round(pct, 2)) + " %"

@staticmethod
def ratio_to_pct(edata: EntryValue, invert: bool = False):
    """Format edata.value, taken as a ratio, as a percentage string.

    Arguments:
    edata -- entry whose `value` attribute holds the ratio.
    invert -- False by default, if True the formatted number is
              100 - (edata.value * 100).

    The ratio is multiplied by 100 and handed to Converters._fmt_pct_type.
    """
    return Converters._fmt_pct_type(edata.value * 100, invert)

@staticmethod
def pct(edata: EntryValue, invert: bool = False):
    """Format edata.value, already a percentage, as a percentage string.

    Arguments:
    edata -- entry whose `value` attribute holds the percentage.
    invert -- False by default, if True the formatted number is
              100 - edata.value.

    The value is handed unchanged to Converters._fmt_pct_type.
    """
    return Converters._fmt_pct_type(edata.value, invert)


FormatterPredicateFnType = Callable[[EntryData], bool]
Expand Down
33 changes: 27 additions & 6 deletions lib/view/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -1677,8 +1677,14 @@ def stop_writes_converter_selector(edata: EntryData):
return None

metric = edata.record["Metric"]
val = ""

if "pct" in metric:
if "avail" in metric:
val = Converters.pct(edata, invert=True)
val = "(inverted) " + val
return val

return Converters.pct(edata)
if "bytes" in metric:
return Converters.byte(edata)
Expand All @@ -1688,6 +1694,25 @@ def stop_writes_converter_selector(edata: EntryData):
return Converters.scientific_units(edata)


class StopWritesUsagePctProjector(Projectors.Number):
    """Number projector that can report ``100 - value`` instead of the value.

    The projected number is inverted when the ``invert`` keyword was given, or
    when the "stop_writes" source record has a "metric" entry containing
    "avail". The result is always passed through _ignore_zero.
    """

    def __init__(self, source, *keys, **kwargs):
        """
        Keyword Arguments:
        invert -- False by default, if True will return 100 - value.
        """
        super().__init__(source, *keys, **kwargs)
        self.invert = kwargs.get("invert", False)

    def do_project(self, sheet, sources):
        """Project the number, inverting it when requested or for "avail" metrics."""
        # NOTE(review): the record is always read from the "stop_writes" source,
        # whatever `source` was passed to __init__; element [1] is presumably
        # the per-row record dict -- confirm against the sheet source layout.
        data = sources.get("stop_writes", ((), {}))[1]
        val = super().do_project(sheet, sources)

        # `self.invert` was previously stored but never consulted, so the
        # documented keyword had no effect; it is now honoured in addition to
        # the metric-name check.
        is_avail_metric = "metric" in data and "avail" in data["metric"]

        if self.invert or is_avail_metric:
            val = 100 - val

        return _ignore_zero(val)


sw_row_yellow_format = (
Formatters.yellow_alert(lambda edata: edata.record["Stop-Writes"] == True),
)
Expand Down Expand Up @@ -1738,12 +1763,8 @@ def stop_writes_converter_selector(edata: EntryData):
Field(
"Usage%",
Projectors.Div(
Projectors.Number("stop_writes", "metric_usage"),
Projectors.Func(
FieldType.number,
_ignore_zero,
Projectors.Number("stop_writes", "metric_threshold"),
),
StopWritesUsagePctProjector("stop_writes", "metric_usage"),
StopWritesUsagePctProjector("stop_writes", "metric_threshold"),
),
converter=Converters.ratio_to_pct,
formatters=sw_val_red_format + sw_val_yellow_format + sw_row_yellow_format,
Expand Down
Loading