Skip to content

Add Rocestat PMDA for collecting and analyzing RoCE device metrics #2132

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion build/rpm/pcp.spec.in
Original file line number Diff line number Diff line change
Expand Up @@ -1570,6 +1570,20 @@ This package contains the PCP Performance Metrics Domain Agent (PMDA) for
collecting metrics from simple network checks.
# end pcp-pmda-netcheck

#
# pcp-pmda-rocestat
#
# Sub-package carrying the rocestat PMDA.  It is pinned to the exact
# pcp and pcp-libs versions built from this spec, and needs the Python 3
# PCP bindings at run time.
#
%package pmda-rocestat
License: GPL-2.0-or-later
Summary: Performance Co-Pilot (PCP) metrics for nVidia RoCE devices
URL: https://pcp.io
Requires: pcp = @package_version@ pcp-libs = @package_version@
Requires: python3-pcp
%description pmda-rocestat
This package contains the PCP Performance Metrics Domain Agent (PMDA) for
collecting statistics for nVidia RDMA over Converged Ethernet (RoCE) devices.
# end pcp-pmda-rocestat

#
# pcp-pmda-openvswitch
#
Expand Down Expand Up @@ -2242,6 +2256,7 @@ basic_manifest | keep '(etc/pcp|pmdas)/postgresql(/|$)' >pcp-pmda-postgresql-fil
basic_manifest | keep '(etc/pcp|pmdas)/rabbitmq(/|$)' >pcp-pmda-rabbitmq-files
basic_manifest | keep '(etc/pcp|pmdas)/redis(/|$)' >pcp-pmda-redis-files
basic_manifest | keep '(etc/pcp|pmdas)/resctrl(/|$)|sys-fs-resctrl' >pcp-pmda-resctrl-files
basic_manifest | keep '(etc/pcp|pmdas)/rocestat(/|$)' >pcp-pmda-rocestat-files
basic_manifest | keep '(etc/pcp|pmdas)/roomtemp(/|$)' >pcp-pmda-roomtemp-files
basic_manifest | keep '(etc/pcp|pmdas)/rsyslog(/|$)' >pcp-pmda-rsyslog-files
basic_manifest | keep '(etc/pcp|pmdas)/samba(/|$)' >pcp-pmda-samba-files
Expand Down Expand Up @@ -2279,7 +2294,7 @@ for pmda_package in \
nutcracker nvidia \
openmetrics openvswitch oracle \
pdns perfevent podman postfix postgresql \
rabbitmq redis resctrl roomtemp rsyslog \
rabbitmq redis resctrl rocestat roomtemp rsyslog \
samba sendmail shping slurm smart snmp \
sockets statsd summary systemd \
unbound uwsgi \
Expand Down Expand Up @@ -2696,6 +2711,9 @@ done
%preun pmda-rabbitmq
%{pmda_remove "$1" "rabbitmq"}

%preun pmda-rocestat
%{pmda_remove "$1" "rocestat"}

%preun pmda-uwsgi
%{pmda_remove "$1" "uwsgi"}
%endif
Expand Down Expand Up @@ -3020,6 +3038,8 @@ fi

%files pmda-rabbitmq -f pcp-pmda-rabbitmq-files.rpm

%files pmda-rocestat -f pcp-pmda-rocestat-files.rpm

%files pmda-uwsgi -f pcp-pmda-uwsgi-files.rpm

%files export-pcp2graphite -f pcp-export-pcp2graphite-files.rpm
Expand Down
27 changes: 26 additions & 1 deletion build/rpm/redhat.spec
Original file line number Diff line number Diff line change
Expand Up @@ -1729,6 +1729,25 @@ collecting metrics from simple network checks.
# end pcp-pmda-netcheck
%endif

#
# pcp-pmda-rocestat
#
# Sub-package carrying the rocestat PMDA.  It is pinned to the exact
# version-release of pcp and pcp-libs, and requires the Python 3 PCP
# bindings unless Python 3 support is disabled, in which case the
# Python 2 bindings are required instead.
#
# NOTE(review): the endif directive that follows the "end pcp-pmda-rocestat"
# marker has no matching conditional opened inside this stanza (the only
# conditional opened here is already closed right after the python Requires
# lines).  Confirm that it pairs with a conditional that is still open at
# this point in the full spec file; otherwise it is unbalanced.
#
%package pmda-rocestat
License: GPL-2.0-or-later
Summary: Performance Co-Pilot (PCP) metrics for nVidia RoCE devices
URL: https://pcp.io
Requires: pcp = %{version}-%{release} pcp-libs = %{version}-%{release}
%if !%{disable_python3}
Requires: python3-pcp
%else
Requires: %{__python2}-pcp
%endif
%description pmda-rocestat
This package contains the PCP Performance Metrics Domain Agent (PMDA) for
collecting statistics for nVidia RDMA over Converged Ethernet (RoCE) devices.
# end pcp-pmda-rocestat
%endif

%if !%{disable_mongodb}
#
# pcp-pmda-mongodb
Expand Down Expand Up @@ -2449,6 +2468,7 @@ basic_manifest | keep '(etc/pcp|pmdas)/postgresql(/|$)' >pcp-pmda-postgresql-fil
basic_manifest | keep '(etc/pcp|pmdas)/rabbitmq(/|$)' >pcp-pmda-rabbitmq-files
basic_manifest | keep '(etc/pcp|pmdas)/redis(/|$)' >pcp-pmda-redis-files
basic_manifest | keep '(etc/pcp|pmdas)/resctrl(/|$)|sys-fs-resctrl' >pcp-pmda-resctrl-files
basic_manifest | keep '(etc/pcp|pmdas)/rocestat(/|$)' >pcp-pmda-rocestat-files
basic_manifest | keep '(etc/pcp|pmdas)/roomtemp(/|$)' >pcp-pmda-roomtemp-files
basic_manifest | keep '(etc/pcp|pmdas)/rpm(/|$)' >pcp-pmda-rpm-files
basic_manifest | keep '(etc/pcp|pmdas)/rsyslog(/|$)' >pcp-pmda-rsyslog-files
Expand Down Expand Up @@ -2487,7 +2507,7 @@ for pmda_package in \
nutcracker nvidia \
openmetrics openvswitch oracle \
pdns perfevent podman postfix postgresql \
rabbitmq redis resctrl roomtemp rpm rsyslog \
rabbitmq redis resctrl rocestat roomtemp rpm rsyslog \
samba sendmail shping slurm smart snmp \
sockets statsd summary systemd \
unbound uwsgi \
Expand Down Expand Up @@ -2865,6 +2885,9 @@ exit 0
%preun pmda-netcheck
%{pmda_remove "$1" "netcheck"}

%preun pmda-rocestat
%{pmda_remove "$1" "rocestat"}

%endif

%preun pmda-apache
Expand Down Expand Up @@ -3209,6 +3232,8 @@ fi

%files pmda-rabbitmq -f pcp-pmda-rabbitmq-files.rpm

%files pmda-rocestat -f pcp-pmda-rocestat-files.rpm

%files pmda-uwsgi -f pcp-pmda-uwsgi-files.rpm

%files export-pcp2graphite -f pcp-export-pcp2graphite-files.rpm
Expand Down
87 changes: 87 additions & 0 deletions qa/1993
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/bin/sh
# PCP QA Test No. 1993
# Testing PCP Rocestat PMDA - install, remove and values.
#
# Copyright (c) 2025 Oracle and/or its affiliates.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#

seq=`basename $0`
echo "QA output created by $seq"

# Pull in the shared QA helpers for python-based tests ($python, _notrun, ...)
. ./common.python

pmda_path="$PCP_PMDAS_DIR/rocestat"
pmda_script="$pmda_path/pmdarocestat.python"
test_script="rocestat/rocestat_test.python"
ib_class_dir=/sys/class/infiniband

# Skip (rather than fail) when the PMDA itself is not present on this host
[ -d "$pmda_path" ] || _notrun "ROCESTAT PMDA is not installed"

# Skip when the python PMDA bindings cannot be imported
$python -c "from pcp import pmda" >/dev/null 2>&1
[ $? -eq 0 ] || _notrun "python pcp pmda module not installed"

if [ ! -d $ib_class_dir ]; then
    # No sysfs class directory at all: report the more specific reason
    # (kernel modules missing) when that is the cause.
    lsmod | grep -q '^ib_core' || _notrun "IB kernel modules are not loaded"
    _notrun "No RoCE devices detected"
fi

# The class directory alone does not prove that a device is registered:
# it can be present but empty, in which case there is nothing to collect
# metrics from and the test cannot produce its expected output.
[ -n "`ls $ib_class_dir 2>/dev/null`" ] || _notrun "No RoCE devices detected"

status=1	# failure is the default!
# On any exit path undo the PMDA changes and propagate $status
trap "_cleanup_pmda rocestat; exit \$status" 0 1 2 3 15

# Make the unittest output on stdin deterministic so that it can be
# compared with the expected QA output:
#  - the test count and elapsed time in the "Ran N tests in T s" summary
#    are replaced by fixed placeholders;
#  - the failure count in a "FAILED (failures=N)" line is replaced by X;
#  - the verbose per-test line is normalised to the older
#    "name (__main__.Class)" layout; newer Python releases (3.11 onwards)
#    print "name (__main__.Class.name)" instead, which would otherwise
#    never match the expected output;
#  - empty lines are dropped.
pmdarocestat_filter()
{
    sed \
	-e "s/Ran [0-9]* tests in [0-9]*\.[0-9][0-9]*s/Ran X tests in YYYs/" \
	-e "s/FAILED (failures=[0-9]*)/FAILED (failures=X)/" \
	-e 's/^\([A-Za-z_][A-Za-z0-9_]*\) (\(__main__\.[A-Za-z_][A-Za-z0-9_]*\)\.\1)/\1 (\2)/' \
	-e "/^$/d"
}

# Run the PMDA's Remove script with $sudo from inside the PMDA directory.
# The script's output is captured in $tmp.out rather than echoed, so only
# the banner appears in the QA output.  The working directory is restored
# to $here afterwards, matching pmdarocestat_install, so that whatever runs
# next (including the exit trap) starts from the QA directory.
pmdarocestat_remove()
{
    cd $pmda_path
    echo
    echo "=== Removing ROCESTAT agent ==="
    $sudo ./Remove >$tmp.out 2>&1
    cd $here
}

# (Re)install the ROCESTAT agent.  Any existing installation is removed
# silently first; the Install script is then run non-interactively with
# its output kept in $tmp.out, and the caller is returned to $here.
pmdarocestat_install()
{
    cd $pmda_path

    # start from a clean slate, ignoring whatever Remove reports
    $sudo ./Remove >/dev/null 2>&1

    printf '\n=== Installing ROCESTAT agent ===\n'
    $sudo ./Install >$tmp.out 2>&1 </dev/null

    cd $here
}

# Run the python test script with $sudo, merge stderr into stdout, sort
# the lines in POSIX collation order so the result does not depend on the
# locale, and pass them through pmdarocestat_filter.
run_rocestat_test()
{
    printf '\n=== Running ROCESTAT test script ===\n'
    $sudo $python $test_script 2>&1 \
	| LC_COLLATE=POSIX sort \
	| pmdarocestat_filter
}

# Real QA test starts here
# NOTE(review): _prepare_pmda comes from the shared QA helpers; presumably
# it records the current state so that the _cleanup_pmda call in the exit
# trap can restore it -- confirm against the helper's definition.
_prepare_pmda rocestat

# Install the agent, exercise it through the python test script, then
# remove it again.  The order matters: the test script runs while the
# agent is installed.
pmdarocestat_install

run_rocestat_test

pmdarocestat_remove

# Success, all done
# The exit trap installed earlier exits with $status, so it must be set
# to 0 before leaving.
status=0
exit
21 changes: 21 additions & 0 deletions qa/1993.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
QA output created by 1993

=== Installing ROCESTAT agent ===

=== Running ROCESTAT test script ===
----------------------------------------------------------------------
Fetching Rocestat PMDA metrics
OK
Ran X tests in YYYs
test_hw_link_metrics (__main__.RocestatTests) ... ok
test_hw_mcast_metrics (__main__.RocestatTests) ... ok
test_hw_metrics (__main__.RocestatTests) ... ok
test_hw_rcv_metrics (__main__.RocestatTests) ... ok
test_hw_req_metrics (__main__.RocestatTests) ... ok
test_hw_resp_metrics (__main__.RocestatTests) ... ok
test_hw_rnr_metrics (__main__.RocestatTests) ... ok
test_hw_ucast_metrics (__main__.RocestatTests) ... ok
test_hw_xmit_metrics (__main__.RocestatTests) ... ok
test_nic_metrics (__main__.RocestatTests) ... ok

=== Removing ROCESTAT agent ===
2 changes: 1 addition & 1 deletion qa/GNUmakefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ TESTS = $(shell sed -n -e '/^[0-9][0-9]*:retired/d' -e '/^[0-9][0-9]*:reserved/d
SUBDIRS = src pmdas cisco gluster pconf sadist collectl nfsclient named \
archives badarchives views qt linux unbound cifs gpfs lustre ganglia \
java mmv postfix perl json slurm tmparch sheet smart admin hacluster \
sockets denki gfs2 farm pdudata
sockets denki gfs2 farm pdudata rocestat

ifeq "$(PMDA_PERFEVENT)" "true"
SUBDIRS += perfevent
Expand Down
2 changes: 1 addition & 1 deletion qa/GNUmakefile.install
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ ifeq "$(PMDA_PERFEVENT)" "true"
SUBDIRS += perfevent
endif
ifeq "$(HAVE_PYTHON)" "true"
SUBDIRS += secure mic haproxy lio openmetrics
SUBDIRS += secure mic haproxy lio openmetrics rocestat
endif

# Before installing the pcpqa policy module we need a PCP build with
Expand Down
2 changes: 2 additions & 0 deletions qa/group
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ pmda.postgresql
pmda.proc
pmda.redis
pmda.resctrl
pmda.rocestat
pmda.root
#pmda.rpm # note this group has been retired
pmda.rsyslog
Expand Down Expand Up @@ -2228,4 +2229,5 @@ pmcd.pdu
1990 pcp buddyinfo python local
1991 pcp netstat python local
1992 pmda.uwsgi local
1993 pmda.rocestat local python
4751 libpcp threads valgrind local pcp helgrind
22 changes: 22 additions & 0 deletions qa/rocestat/GNUmakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!gmake
#
# Build and install rules for the rocestat QA support files.
#

TOPDIR = ../..
include $(TOPDIR)/src/include/builddefs

TESTDIR = $(PCP_VAR_DIR)/testsuite/rocestat
# Data files needed next to the test script in the installed testsuite:
# the expected values (rocestat.json) and the list of metrics to fetch
# (metrics.list), both of which the README names as inputs of the test.
MYFILES = rocestat.json metrics.list
PYMODULES = rocestat_test.python
# Generated files (the .py symlink and any byte-compiled forms).
# NOTE(review): LDIRT is presumably consumed by the shared clean rules
# pulled in via BUILDRULES -- confirm.
LDIRT = rocestat_test.py rocestat_test.pyc rocestat_test.pyo

default default_pcp setup: rocestat_test.py

# Each data file is installed individually using the "source target" form
# of $(INSTALL), so MYFILES may hold any number of files.
install install_pcp:
	$(INSTALL) -m 755 -d $(TESTDIR)
	for f in $(MYFILES); do \
	    $(INSTALL) -m 644 -f $$f $(TESTDIR)/$$f || exit 1; \
	done
	$(INSTALL) -m 755 -f $(PYMODULES) $(TESTDIR)/$(PYMODULES)
	$(INSTALL) -m 644 -f GNUmakefile.install $(TESTDIR)/GNUmakefile

include $(BUILDRULES)

# Expose the .python source under a .py name via a symbolic link.
%.py : %.python
	$(LN_S) $< $@
20 changes: 20 additions & 0 deletions qa/rocestat/GNUmakefile.install
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!gmake
#
# Makefile used inside the installed testsuite: the source-tree makefile
# installs this file as GNUmakefile in the rocestat testsuite directory.
#

# Locate the PCP configuration: honour PCP_CONF when it is set, otherwise
# fall back to pcp.conf below PCP_DIR.
ifdef PCP_CONF
include $(PCP_CONF)
else
include $(PCP_DIR)/etc/pcp.conf
endif
# Use the PATH that results from sourcing pcp.env
PATH = $(shell . $(PCP_DIR)/etc/pcp.env; echo $$PATH)
include $(PCP_INC_DIR)/builddefs

TESTDIR = $(PCP_VAR_DIR)/testsuite/rocestat

# The only thing to build here is the .py link to the test script
default default_pcp setup: rocestat_test.py

# Nothing to install from an already-installed testsuite
install install_pcp:

include $(BUILDRULES)

# Expose the .python source under a .py name via a symbolic link
%.py : %.python
$(LN_S) $< $@
29 changes: 29 additions & 0 deletions qa/rocestat/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Pre-Requisites
--------------
In order to perform the tests, we'll make some assumptions:

- The `pcp` Python module is installed and accessible.
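- The `rocestat` PMDA files are present on the system (the `rocestat` directory exists under `$PCP_PMDAS_DIR`); the test runs the PMDA's `Install` and `Remove` scripts itself.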
- The system has RoCE devices available and the required kernel modules (`ib_core`) are loaded.
- You are able to run the test as the root user (required for PMDA installation and removal).

What the Test Does
------------------
The test validates the functionality of the `rocestat` PMDA by performing the following steps:

1. Installs the `rocestat` PMDA.
2. Fetches metrics defined in `metrics.list` and validates their values against the expected values in `rocestat.json`.
3. Runs unit tests for each metric cluster (e.g., `hw`, `hw_xmit`, `hw_rcv`) to ensure the PMDA is reporting correct values.
4. Removes the `rocestat` PMDA after the tests are complete.

If any of the above checks fail, an exception will be raised, and the test will terminate.

How to Test
-----------
1. Make sure the pre-requisites are in place.
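2. Run the `rocestat_test.python` script as root. QA test 1993 does this automatically, after installing the PMDA and before removing it again.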

Issues
------
- None known at this time

49 changes: 49 additions & 0 deletions qa/rocestat/metrics.list
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
rocestat.lane.rx_pause
rocestat.lane.rx_bytes
rocestat.lane.tx_bytes
rocestat.hw.link.link_error_recovery
rocestat.hw.link.link_downed
rocestat.hw.link.local_link_integrity_errors
rocestat.hw.rnr.rnr_nak_retry_err
rocestat.hw.resp.resp_local_length_error
rocestat.hw.resp.resp_cqe_error
rocestat.hw.resp.resp_cqe_flush_error
rocestat.hw.resp.resp_remote_access_errors
rocestat.hw.req.req_remote_invalid_request
rocestat.hw.req.req_cqe_error
rocestat.hw.req.req_cqe_flush_error
rocestat.hw.req.duplicate_request
rocestat.hw.req.rx_read_requests
rocestat.hw.req.rx_atomic_requests
rocestat.hw.req.req_remote_access_errors
rocestat.hw.req.rx_write_requests
rocestat.hw.mcast.multicast_rcv_packets
rocestat.hw.mcast.multicast_xmit_packets
rocestat.hw.ucast.unicast_rcv_packets
rocestat.hw.ucast.unicast_xmit_packets
rocestat.hw.rcv.port_rcv_errors
rocestat.hw.rcv.port_rcv_remote_physical_errors
rocestat.hw.rcv.port_rcv_packets
rocestat.hw.rcv.port_rcv_data
rocestat.hw.rcv.port_rcv_constraint_errors
rocestat.hw.rcv.port_rcv_switch_relay_errors
rocestat.hw.xmit.port_xmit_data
rocestat.hw.xmit.port_xmit_constraint_errors
rocestat.hw.xmit.port_xmit_wait
rocestat.hw.xmit.port_xmit_packets
rocestat.hw.xmit.port_xmit_discards
rocestat.hw.roce_slow_restart_trans
rocestat.hw.roce_slow_restart_cnps
rocestat.hw.roce_slow_restart
rocestat.hw.roce_adp_retrans_to
rocestat.hw.clear_counters
rocestat.hw.local_ack_timeout_err
rocestat.hw.lifespan
rocestat.hw.implied_nak_seq_err
rocestat.hw.packet_seq_err
rocestat.hw.roce_adp_retrans
rocestat.hw.out_of_buffer
rocestat.hw.out_of_sequence
rocestat.hw.VL15_dropped
rocestat.hw.excessive_buffer_overrun_errors
rocestat.hw.symbol_error
Loading
Loading